diff options
author | jfons <joan.fonssanchez@gmail.com> | 2021-05-20 12:49:33 +0200 |
---|---|---|
committer | jfons <joan.fonssanchez@gmail.com> | 2021-05-21 17:00:24 +0200 |
commit | 767e374dced69b45db0afb30ca2ccf0bbbeef672 (patch) | |
tree | a712cecc2c8cc2c6d6ecdc4a50020d423ddb4c0c /thirdparty/embree/common | |
parent | 42b6602f1d4b108cecb94b94c0d2b645acaebd4f (diff) |
Upgrade Embree to the latest official release.
Since Embree v3.13.0 supports AARCH64, switch back to the
official repo instead of using Embree-aarch64.
`thirdparty/embree/patches/godot-changes.patch` should now contain
an accurate diff of the changes made to the library.
Diffstat (limited to 'thirdparty/embree/common')
103 files changed, 29497 insertions, 0 deletions
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h new file mode 100644 index 0000000000..a64e4a1889 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <functional> +#include "parallel_reduce.h" + +namespace embree +{ + + template<typename Index, class UnaryPredicate> + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool { + bool localret = false; + for (auto i=r.begin(); i<r.end(); ++i) { + localret |= pred(i); + } + return localret; + }, + std::bit_or<bool>() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h new file mode 100644 index 0000000000..090ef164c2 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_filter.h @@ -0,0 +1,93 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + 
+namespace embree
+{
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+  {
+    Index j = first; /* write position: elements accepted by the predicate get compacted to the front of [first,last), keeping their order */
+    for (Index i=first; i<last; i++)
+      if (predicate(data[i]))
+        data[j++] = data[i];
+
+    return j; /* index one past the last kept element */
+  }
+  /* parallel version of sequential_filter: filters up to MAX_TASKS slices of [begin,end) independently and then fills the holes of each slice with kept elements read backwards from the used part of later slices (so the original order is not kept); returns the index one past the last kept element */
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+  {
+    /* sequential fallback when the range is not larger than a single block */
+    if (end-begin <= minStepSize)
+      return sequential_filter(data,begin,end,predicate);
+
+    /* calculate number of tasks to use */
+    enum { MAX_TASKS = 64 };
+    const Index numThreads = TaskScheduler::threadCount();
+    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
+    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+    /* filter blocks: nused[t] is the number of kept and nfree[t] the number of removed elements of slice t */
+    Index nused[MAX_TASKS];
+    Index nfree[MAX_TASKS];
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+      const Index i2 = sequential_filter(data,i0,i1,predicate);
+      nused[taskIndex] = i2-i0;
+      nfree[taskIndex] = i1-i2;
+    });
+
+    /* calculate offsets: sused/sfree are the totals, pfree[t] is the number of free slots in all slices before t */
+    Index sused=0;
+    Index sfree=0;
+    Index pfree[MAX_TASKS];
+    for (Index i=0; i<taskCount; i++)
+    {
+      sused+=nused[i];
+      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+    }
+
+    /* return if we did not filter out any element */
+    assert(sfree <= end-begin);
+    assert(sused <= end-begin);
+    if (sused == end-begin)
+      return end;
+
+    /* otherwise we have to copy misplaced elements around */
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      /* destination to write elements to: the free slots of this slice that lie inside the final range [begin,begin+sused) */
+      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+      if (dst_end <= dst) return;
+
+      /* range of misplaced elements to copy to destination */
+      Index r0 = pfree[taskIndex];
+      Index r1 = r0+dst_end-dst;
+
+      /* find range in misplaced elements in back to front order */
+      Index k0=0;
+      for (Index i=taskCount-1; i>0; i--)
+      {
+        if (k0 > r1) break;
+        Index k1 = k0+nused[i];
+        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+          Index isrc = src-i+k0-1;
+          assert(dst >= begin && dst < end);
+          assert(isrc >= begin && isrc < end);
+          data[dst++] = data[isrc];
+        }
+        k0 = k1;
+      }
+    });
+
+    return begin+sused;
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..645681ac63
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* parallel_for without range: calls func(i) for every index i in [0,N); with the internal and the TBB tasking system a cancelled task set aborts the process (Godot replaced the original exception by abort()) */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index N, const Func& func)
+  {
+#if defined(TASKING_INTERNAL)
+    if (N) {
+      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+          assert(r.size() == 1);
+          func(r.begin());
+        });
+      if (!TaskScheduler::wait())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    }
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+
+  /* parallel for with range and granularity: calls func with sub-ranges of [first,last); minStepSize is handed to the internal scheduler resp. to tbb::blocked_range, the PPL path ignores it and passes ranges of a single element */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+  {
+    assert(first <= last);
+#if defined(TASKING_INTERNAL)
+    TaskScheduler::spawn(first,last,minStepSize,func);
+    if (!TaskScheduler::wait())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
+        func(range<Index>(i,i+1));
+      });
+
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+
+  /* parallel for with range: same as above with a minStepSize of 1 */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+  {
+    assert(first <= last);
+    parallel_for(first,last,(Index)1,func);
+  }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+  /* like parallel_for(N,func), but passes tbb::simple_partitioner to TBB */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner(),context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner());
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #endif
+  }
+
+  typedef tbb::affinity_partitioner affinity_partitioner;
+  /* like parallel_for(N,func), but passes the affinity partitioner ap of the caller to TBB */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap,context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap);
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #endif
+  }
+
+#else
+  /* fallbacks for all other configurations: both variants simply forward to parallel_for(N,func) */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    parallel_for(N,func);
+  }
+
+  struct affinity_partitioner {
+  };
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
+  {
+    parallel_for(N,func);
+  }
+
+#endif
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..92c37a4a38
---
/dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename ArrayArray, typename Func> + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range<size_t>(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template<typename ArrayArray> + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template<typename ArrayArray> + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + size_t N = 0; + for (size_t i=0; i<array2.size(); i++) { + N += array2[i] ? array2[i]->size() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(i<array2.size()); + size_t j=0, M = array2[i] ? 
array2[i]->size() : 0; + while (j<M && k+M-j >= k0 && taskIndex < taskCount) { + assert(taskIndex<taskCount); + i0[taskIndex] = i; + j0[taskIndex] = j += k0-k; + k=k0; + k0 = (++taskIndex)*N/taskCount; + } + k+=M-j; + } + } + + __forceinline size_t size() const { + return N; + } + + public: + size_t i0[MAX_TASKS]; + size_t j0[MAX_TASKS]; + size_t taskCount; + size_t N; + }; + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; i<state.taskCount; i++) + temp[i] = identity; + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = 
state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i<state.taskCount; i++) + ret = reduction(ret,temp[i]); + return ret; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h new file mode 100644 index 0000000000..b15b44a991 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for_for.h" +#include "parallel_prefix_sum.h" + +namespace embree +{ + template<typename Value> + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template<typename ArrayArray> + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState<Value> prefix_state; + }; + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = 
state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? 
array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h new file mode 100644 index 0000000000..15c098fe20 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_map.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /*! 
implementation of a key/value map with parallel construction */
+  template<typename Key, typename Val>
+  class parallel_map
+  {
+    /* key/value pair to build the map; the conversion to Key is what lets std::lower_bound in lookup() compare an entry against a plain key */
+    struct KeyValue
+    {
+      __forceinline KeyValue () {}
+
+      __forceinline KeyValue (const Key key, const Val val)
+        : key(key), val(val) {}
+
+      __forceinline operator Key() const {
+        return key;
+      }
+
+    public:
+      Key key;
+      Val val;
+    };
+
+  public:
+
+    /*! parallel map constructors */
+    parallel_map () {}
+
+    /*! construction from pair of vectors */
+    template<typename KeyVector, typename ValVector>
+      parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+    /*! initializes the parallel map from a vector with keys and a vector with values of the same size */
+    template<typename KeyVector, typename ValVector>
+      void init(const KeyVector& keys, const ValVector& values)
+    {
+      /* reserve sufficient space for all data */
+      assert(keys.size() == values.size());
+      vec.resize(keys.size());
+
+      /* generate key/value pairs */
+      parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
+        for (size_t i=r.begin(); i<r.end(); i++)
+          vec[i] = KeyValue((Key)keys[i],values[i]);
+      });
+
+      /* perform parallel radix sort of the key/value pairs */
+      std::vector<KeyValue> temp(keys.size());
+      radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
+    }
+
+    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
+    __forceinline const Val* lookup(const Key& key) const
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return nullptr;
+      if (i->key != key) return nullptr;
+      return &i->val;
+    }
+
+    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+    __forceinline Val lookup(const Key& key, const Val& def) const
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return def;
+      if (i->key != key) return def;
+      return i->val;
+    }
+
+    /*! clears all state */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<KeyValue> vec;   //!< vector containing sorted elements
+  };
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..a1cbdc8e04
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* serial partitioning: reorders array[begin,end) such that all elements with is_left(x) come first, passes every element to reduction_t together with leftReduction or rightReduction according to the side it ends up on, and returns the index of the first right element */
+  template<typename T, typename V, typename IsLeft, typename Reduction_T>
+    __forceinline size_t serial_partitioning(T* array,
+                                             const size_t begin,
+                                             const size_t end,
+                                             V& leftReduction,
+                                             V& rightReduction,
+                                             const IsLeft& is_left,
+                                             const Reduction_T& reduction_t)
+  {
+    T* l = array + begin;
+    T* r = array + end - 1;
+
+    while(1)
+    {
+      /* *l < pivot */
+      while (likely(l <= r && is_left(*l) ))
+      {
+        //prefetchw(l+4); // FIXME: enable?
+        reduction_t(leftReduction,*l);
+        ++l;
+      }
+      /* *r >= pivot) */
+      while (likely(l <= r && !is_left(*r)))
+      {
+        //prefetchw(r-4); FIXME: enable?
+        reduction_t(rightReduction,*r);
+        --r;
+      }
+      if (r<l) break;
+
+      reduction_t(leftReduction ,*r);
+      reduction_t(rightReduction,*l);
+      xchg(*l,*r);
+      l++; r--;
+    }
+
+    return l - array;
+  }
+  /* helper that partitions array[0,N) with up to MAX_TASKS tasks: every task partitions its own slice serially, afterwards the items that ended up on the wrong side of the global split point are swapped in parallel */
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    class __aligned(64) parallel_partition_task
+  {
+    ALIGNED_CLASS_(64);
+  private:
+
+    static const size_t MAX_TASKS = 64;
+
+    T* array;
+    size_t N;
+    const IsLeft& is_left;
+    const Reduction_T& reduction_t;
+    const Reduction_V& reduction_v;
+    const Vi& identity;
+
+    size_t numTasks;
+    __aligned(64) size_t counter_start[MAX_TASKS+1];
+    __aligned(64) size_t counter_left[MAX_TASKS+1];
+    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
+    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
+    __aligned(64) V leftReductions[MAX_TASKS];
+    __aligned(64) V rightReductions[MAX_TASKS];
+
+  public:
+
+    __forceinline parallel_partition_task(T* array,
+                                          const size_t N,
+                                          const Vi& identity,
+                                          const IsLeft& is_left,
+                                          const Reduction_T& reduction_t,
+                                          const Reduction_V& reduction_v,
+                                          const size_t BLOCK_SIZE)
+
+      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+    /* finds the range that contains the index-th item of the concatenated ranges r; on return index is the offset inside the returned range */
+    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+    {
+      size_t i = 0;
+      while(index >= (size_t)r[i].size())
+      {
+        assert(i < numRanges);
+        index -= (size_t)r[i].size();
+        i++;
+      }
+      return &r[i];
+    }
+    /* swaps the items startID..endID-1 of the concatenated left misplaced ranges with the items at the same positions of the concatenated right misplaced ranges */
+    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+                                                  const size_t numRightMisplacedRanges,
+                                                  const size_t startID,
+                                                  const size_t endID)
+    {
+      size_t leftLocalIndex  = startID;
+      size_t rightLocalIndex = startID;
+      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+
+      size_t l_left = l_range->size() - leftLocalIndex;
+      size_t r_left = r_range->size() - rightLocalIndex;
+      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+      size_t size = endID - startID;
+      size_t items = min(size,min(l_left,r_left));
+
+      while (size)
+      {
+        if (unlikely(l_left == 0))
+        {
+          l_range++;
+          l_left = l_range->size();
+          l = &array[l_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        if (unlikely(r_left == 0))
+        {
+          r_range++;
+          r_left = r_range->size();
+          r = &array[r_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        size -= items;
+        l_left -= items;
+        r_left -= items;
+
+        while(items) {
+          items--;
+          xchg(*l++,*r++);
+        }
+      }
+    }
+    /* performs the partitioning, merges the per-task reductions into leftReduction and rightReduction via reduction_v and returns the number of left items */
+    __forceinline size_t partition(V& leftReduction, V& rightReduction)
+    {
+      /* partition the individual ranges for each task */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*N/numTasks;
+          const size_t endID   = (taskID+1)*N/numTasks;
+          V local_left(identity);
+          V local_right(identity);
+          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+          counter_start[taskID] = startID;
+          counter_left [taskID] = mid-startID;
+          leftReductions[taskID]  = local_left;
+          rightReductions[taskID] = local_right;
+        });
+      counter_start[numTasks] = N;
+      counter_left[numTasks]  = 0;
+
+      /* finalize the reductions */
+      for (size_t i=0; i<numTasks; i++) {
+        reduction_v(leftReduction,leftReductions[i]);
+        reduction_v(rightReduction,rightReductions[i]);
+      }
+
+      /* calculate mid point for partitioning */
+      size_t mid = counter_left[0];
+      for (size_t i=1; i<numTasks; i++)
+        mid += counter_left[i];
+      const range<ssize_t> globalLeft (0,mid);
+      const range<ssize_t> globalRight(mid,N);
+
+      /* calculate all left and right ranges that are on the wrong global side */
+      size_t numMisplacedRangesLeft  = 0;
+      size_t numMisplacedRangesRight = 0;
+      size_t numMisplacedItemsLeft   = 0;
+      size_t numMisplacedItemsRight  = 0;
+
+      for (size_t i=0; i<numTasks; i++)
+      {
+        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
+        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+        if (!left_misplaced.empty())
+        {
+          numMisplacedItemsLeft += left_misplaced.size();
+          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+        }
+
+        if (!right_misplaced.empty())
+        {
+          numMisplacedItemsRight += right_misplaced.size();
+          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+        }
+      }
+      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+      /* if no items are misplaced we are done */
+      if (numMisplacedItemsLeft == 0)
+        return mid;
+
+      /* otherwise we copy the items to the right place in parallel */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
+          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
+        });
+
+      return mid;
+    }
+  };
+  /* partitions array[begin,end) and returns the index of the first right element; ranges with fewer than BLOCK_SIZE elements are partitioned serially */
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array,
+                                            const size_t begin,
+                                            const size_t end,
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left,
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE = 128)
+  {
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < BLOCK_SIZE))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);
+    }
+  }
+  /* same as above, but the serial code is selected by a separate PARALLEL_THRESHOLD instead of BLOCK_SIZE */
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array,
+                                            const size_t begin,
+                                            const size_t end,
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left,
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE,
+                                            size_t PARALLEL_THRESHOLD)
+  {
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < PARALLEL_THRESHOLD))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);
+    }
+  }
+
+  /* convenience overload without reductions: passes empty reduction functions and dummy accumulators */
+  template<typename T, typename IsLeft>
+    inline size_t parallel_partitioning(T* array,
+                                        const size_t begin,
+                                        const size_t end,
+                                        const IsLeft& is_left,
+                                        size_t BLOCK_SIZE = 128)
+  {
+    size_t leftReduction = 0;
+    size_t rightReduction = 0;
+    return parallel_partitioning(
+      array,begin,end,0,leftReduction,rightReduction,is_left,
+      [] (size_t& t,const T& ref) { },
+      [] (size_t& t0,size_t& t1) { },
+      BLOCK_SIZE);
+  }
+
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..208bb4e480
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel
Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelPrefixSumState
+  {
+    enum { MAX_TASKS = 64 };
+    Value counts[MAX_TASKS]; /* value returned by func for each task */
+    Value sums  [MAX_TASKS]; /* exclusive prefix of counts, written at the end of each pass */
+  };
+  /* one pass of the prefix sum: splits [first,last) into up to MAX_TASKS tasks, stores func(range,state.sums[task]) in state.counts, then overwrites state.sums with the exclusive prefix of the counts and returns the total; state.sums is handed to func before it is rewritten, so it only holds meaningful values when a previous pass filled it */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t numThreads = TaskScheduler::threadCount();
+    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
+    const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
+      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
+      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
+    });
+
+    /* calculate prefix sum */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.counts[i];
+      state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+
+  /*! parallel calculation of prefix sums: writes the exclusive prefix sum of src[0..N-1] to dst (dst[i] combines identity with src[0..i-1]) and returns the sum of all elements */
+  template<typename SrcArray, typename DstArray, typename Value, typename Add>
+    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
+  {
+    /* perform single threaded prefix operation for small N */
+    if (N < SINGLE_THREAD_THRESHOLD)
+    {
+      Value sum=identity;
+      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
+      return sum;
+    }
+
+    /* perform parallel prefix operation for large N */
+    else
+    {
+      ParallelPrefixSumState<Value> state;
+
+      /* initial run just sets up start values for subtasks (the sum argument is ignored in this pass) */
+      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+          return s;
+
+        }, add);
+
+      /* final run calculates prefix sum: sum is the total of all previous tasks, s the running total inside the task */
+      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            dst[i] = add(sum,s);
+            s = add(s,src[i]);
+          }
+          return s;
+
+        }, add);
+    }
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..8271372ea4
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    return func(range<Index>(first,last)); /* a single call of func covers the whole range, identity and reduction are not used */
+  }
+
+  template<typename Index, typename Value, typename Func, typename
/*! Serial stand-in for the minStepSize flavour of parallel_reduce: calls
 *  func once on [first,last). minStepSize, identity and reduction are unused. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
  return func(range<Index>(first,last));
}

/*! Reduction on top of parallel_for.
 *  taskCount is clamped to the scheduler's thread count and to 512. Each task
 *  evaluates func on one contiguous slice of [first,last) and stores the
 *  result; the results are then folded serially, in task order, starting
 *  from identity. minStepSize is not used inside this function. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
  const Index maxTasks = 512;
  const Index threadCount = (Index) TaskScheduler::threadCount();
  taskCount = min(taskCount,threadCount,maxTasks);

  /* parallel invocation of all tasks */
  dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
  parallel_for(taskCount, [&](const Index taskIndex) {
      const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
      const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
      values[taskIndex] = func(range<Index>(k0,k1));
    });

  /* perform reduction over all tasks */
  Value v = identity;
  for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
  return v;
}

/*! Reduces func over [first,last); func maps a range<Index> to a Value and
 *  reduction combines two Values. The implementation is chosen at compile
 *  time:
 *   - TASKING_INTERNAL: a single block is evaluated inline, otherwise
 *     parallel_reduce_internal is used with one task per minStepSize block;
 *   - TASKING_TBB: tbb::parallel_reduce over a blocked_range with grain size
 *     minStepSize (the cancellation checks are commented out by the Godot
 *     patch, so a cancelled group still returns v);
 *   - otherwise (PPL): concurrency::parallel_reduce over an index iterator;
 *     minStepSize is not used on this path. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
#if defined(TASKING_INTERNAL)

  /* fast path for small number of iterations */
  Index taskCount = (last-first+minStepSize-1)/minStepSize;
  if (likely(taskCount == 1)) {
    return func(range<Index>(first,last));
  }
  return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);

#elif defined(TASKING_TBB)
  #if TBB_INTERFACE_VERSION >= 12002
  tbb::task_group_context context;
  const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
    [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
    reduction,context);
  // -- GODOT start --
  // if (context.is_group_execution_cancelled())
  //   throw std::runtime_error("task cancelled");
  // -- GODOT end --
  return v;
  #else
  const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
    [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
    reduction);
  // -- GODOT start --
  // if (tbb::task::self().is_cancelled())
  //   throw std::runtime_error("task cancelled");
  // -- GODOT end --
  return v;
  #endif
#else // TASKING_PPL
  /* Wrapper that keeps a Value inside an over-sized char buffer and places
   * it at the first suitably aligned address within that buffer.
   * NOTE(review): the placement-new'd Value is never destroyed explicitly;
   * looks harmless only for trivially destructible Value types - confirm. */
  struct AlignedValue
  {
    char storage[__alignof(Value)+sizeof(Value)];
    /* rounds p up to a multiple of a; ~(p-1) equals -p, so the added term is
     * the distance to the next multiple when a is a power of two */
    static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
    Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
    const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
    AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
    AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
    /* the "move" operations take const rvalues and therefore copy */
    AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
    AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
    AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
    operator Value() const { return *getValuePtr(); }
  };

  /* Minimal forward iterator over plain indices; it is only compared and
   * incremented, and its value_type tells PPL to accumulate AlignedValue. */
  struct Iterator_Index
  {
    Index v;
    typedef std::forward_iterator_tag iterator_category;
    typedef AlignedValue value_type;
    typedef Index difference_type;
    typedef Index distance_type;
    typedef AlignedValue* pointer;
    typedef AlignedValue& reference;
    __forceinline Iterator_Index() {}
    __forceinline Iterator_Index(Index v) : v(v) {}
    __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
    __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
    __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
    __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
  };

  /* reduces one sub-range handed out by PPL, starting from 'start' */
  auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
    assert(begin.v < end.v);
    return reduction(start, func(range<Index>(begin.v, end.v)));
  };
  const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
  return v;
#endif
}

/*! Like parallel_reduce above, but ranges shorter than parallel_threshold
 *  are evaluated with a single direct call to func. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
  if (likely(last-first < parallel_threshold)) {
    return func(range<Index>(first,last));
  } else {
    return parallel_reduce(first,last,minStepSize,identity,func,reduction);
  }
}

/*! range<Index> flavour of the thresholded parallel_reduce. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
  return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
}

/*! Per-index flavour: here func is invoked with a single index i, and its
 *  results are folded with reduction, using a minimum step size of 1. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
  /* adapter turning the per-index func into a per-range functor */
  auto funcr = [&] ( const range<Index> r ) {
    Value v = identity;
    for (Index i=r.begin(); i<r.end(); i++)
      v = reduction(v,func(i));
    return v;
  };
  return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
}

/*! range<Index> flavour without threshold; func takes a range and the
 *  minimum step size is 1. */
template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
{
  return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
}
typename Reduction> + __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h new file mode 100644 index 0000000000..7eae577457 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_set.h @@ -0,0 +1,52 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /* implementation of a set of values with parallel construction */ + template<typename T> + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template<typename Vector> + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template<typename Vector> + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + vec[i] = in[i]; + }); + + /* sort the data */ + std::vector<T> temp(in.size()); + radix_sort<T>(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! 
clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector<T> vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h new file mode 100644 index 0000000000..30e56c2bfc --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_sort.h @@ -0,0 +1,454 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#include <algorithm> + +namespace embree +{ + template<class T> + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template<class T> + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while 
(t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending<T>(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending<T>(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + template<typename T> + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 
/*! Parallel least-significant-digit radix sort of N elements of type Ty,
 *  ordered by their conversion to Key, processing 8 bits per pass.
 *  'src' holds the input and also the final result; 'tmp' is a scratch
 *  buffer the passes ping-pong with. Objects are not copyable. */
template<typename Ty, typename Key>
  class ParallelRadixSort
{
  static const size_t MAX_TASKS = 64;           // upper bound on tasks per pass
  static const size_t BITS = 8;                 // digit width
  static const size_t BUCKETS = (1 << BITS);    // buckets per digit
  typedef unsigned int TyRadixCount[BUCKETS];   // one histogram (one per task)

  /* comparison used by the small-N std::sort path: orders by Key */
  template<typename T>
    static bool compare(const T& v0, const T& v1) {
    return (Key)v0 < (Key)v1;
  }

private:
  ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
  ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement


public:
  /*! only records the buffers; the histograms are allocated lazily */
  ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
    : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

  /*! Sorts src[0,N). Inputs of at most blockSize elements are sorted with
   *  std::sort; larger inputs use the radix passes with one task per
   *  blockSize elements, limited by the thread count and MAX_TASKS. */
  void sort(const size_t blockSize)
  {
    assert(blockSize > 0);

    /* perform single threaded sort for small N */
    if (N<=blockSize) // handles also special case of 0!
    {
      /* do inplace sort inside destination array */
      std::sort(src,src+N,compare<Ty>);
    }

    /* perform parallel sort for large N */
    else
    {
      const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
      tbbRadixSort(numThreads);
    }
  }

  /*! releases the histogram storage; radixCount is still nullptr here when
   *  the parallel path never ran */
  ~ParallelRadixSort()
  {
    alignedFree(radixCount);
    radixCount = nullptr;
  }

private:

  /*! Counting phase of one pass: task 'threadIndex' builds the histogram of
   *  the digit at 'shift' over its slice of src. dst is not touched. */
  void tbbRadixIteration0(const Key shift,
                          const Ty* __restrict const src,
                          Ty* __restrict const dst,
                          const size_t threadIndex, const size_t threadCount)
  {
    const size_t startID = (threadIndex+0)*N/threadCount;
    const size_t endID   = (threadIndex+1)*N/threadCount;

    /* mask to extract some number of bits */
    const Key mask = BUCKETS-1;

    /* count how many items go into the buckets */
    for (size_t i=0; i<BUCKETS; i++)
      radixCount[threadIndex][i] = 0;

    /* iterate over src array and count buckets */
    unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
      const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
      const Key index = ((Key)src[i] >> shift) & mask;
#endif
      count[index]++;
    }
  }

  /*! Scatter phase of one pass: from the histograms of all tasks, task
   *  'threadIndex' derives where its elements of each bucket start in dst
   *  and copies its slice of src there in input order. Must run after the
   *  counting phase of every task has finished. */
  void tbbRadixIteration1(const Key shift,
                          const Ty* __restrict const src,
                          Ty* __restrict const dst,
                          const size_t threadIndex, const size_t threadCount)
  {
    const size_t startID = (threadIndex+0)*N/threadCount;
    const size_t endID   = (threadIndex+1)*N/threadCount;

    /* mask to extract some number of bits */
    const Key mask = BUCKETS-1;

    /* calculate total number of items for each bucket */
    __aligned(64) unsigned int total[BUCKETS];
    /*
    for (size_t i=0; i<BUCKETS; i++)
      total[i] = 0;
    */
    for (size_t i=0; i<BUCKETS; i+=VSIZEX)
      vintx::store(&total[i], zero);

    for (size_t i=0; i<threadCount; i++)
    {
      /*
      for (size_t j=0; j<BUCKETS; j++)
        total[j] += radixCount[i][j];
      */
      for (size_t j=0; j<BUCKETS; j+=VSIZEX)
        vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
    }

    /* calculate start offset of each bucket */
    __aligned(64) unsigned int offset[BUCKETS];
    offset[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      offset[i] = offset[i-1] + total[i-1];

    /* calculate start offset of each bucket for this thread */
    for (size_t i=0; i<threadIndex; i++)
    {
      /*
      for (size_t j=0; j<BUCKETS; j++)
        offset[j] += radixCount[i][j];
      */
      for (size_t j=0; j<BUCKETS; j+=VSIZEX)
        vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
    }

    /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=startID; i<endID; i++) {
      const Ty elt = src[i];
#if defined(__64BIT__)
      const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
      const size_t index = ((Key)src[i] >> shift) & mask;
#endif
      dst[offset[index]++] = elt;
    }
  }

  /*! One complete pass (count, then scatter) from src to dst for the digit
   *  at 'shift'. The 'last' flag is accepted but not used here. */
  void tbbRadixIteration(const Key shift, const bool last,
                         const Ty* __restrict src, Ty* __restrict dst,
                         const size_t numTasks)
  {
    affinity_partitioner ap;
    parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
    parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
  }

  /*! Allocates the per-task histograms and runs 4 passes for 4-byte keys or
   *  8 passes for 8-byte keys; the even pass count leaves the result in src.
   *  NOTE(review): radixCount is overwritten without freeing a previous
   *  allocation, so calling sort() twice on one object would leak.
   *  NOTE(review): for any other sizeof(Key) no pass runs and the data stays
   *  unsorted - confirm that only 32/64-bit keys are ever used. */
  void tbbRadixSort(const size_t numTasks)
  {
    radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

    if (sizeof(Key) == sizeof(uint32_t)) {
      tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
      tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
    }
    else if (sizeof(Key) == sizeof(uint64_t))
    {
      tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
      tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
      tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
      tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
      tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
    }
  }

private:
  TyRadixCount* radixCount;  // MAX_TASKS histograms, allocated by tbbRadixSort
  Ty* const src;             // input and final output
  Ty* const tmp;             // scratch buffer
  const size_t N;            // number of elements
};
sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template<typename Ty> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); + } + + template<typename Ty, typename Key> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); + } + + template<typename Ty> + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); + } + + template<typename Ty> + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); + } +} diff --git a/thirdparty/embree/common/lexers/parsestream.h b/thirdparty/embree/common/lexers/parsestream.h new file mode 100644 index 0000000000..f65a52cb47 --- /dev/null +++ b/thirdparty/embree/common/lexers/parsestream.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stringstream.h" +#include "../sys/filename.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/col3.h" +#include 
"../math/color.h" + +namespace embree +{ + /*! helper class for simple command line parsing */ + class ParseStream : public Stream<std::string> + { + public: + ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {} + + ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false) + : cin(new StringStream(cin,seps,endl,multiLine)) {} + + public: + ParseLocation location() { return cin->loc(); } + std::string next() { return cin->get(); } + + void force(const std::string& next) { + std::string token = getString(); + if (token != next) + THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); + } + + std::string getString() { + return get(); + } + + FileName getFileName() { + return FileName(get()); + } + + int getInt () { + return atoi(get().c_str()); + } + + Vec2i getVec2i() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + return Vec2i(x,y); + } + + Vec3ia getVec3ia() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + int z = atoi(get().c_str()); + return Vec3ia(x,y,z); + } + + float getFloat() { + return (float)atof(get().c_str()); + } + + Vec2f getVec2f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + return Vec2f(x,y); + } + + Vec3f getVec3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3f(x,y,z); + } + + Vec3fa getVec3fa() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3fa(x,y,z); + } + + Col3f getCol3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Col3f(x,y,z); + } + + Color getColor() { + float r = (float)atof(get().c_str()); + float g = (float)atof(get().c_str()); + float b = (float)atof(get().c_str()); + return 
Color(r,g,b); + } + + private: + Ref<Stream<std::string> > cin; + }; +} diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h new file mode 100644 index 0000000000..a40c15f8eb --- /dev/null +++ b/thirdparty/embree/common/lexers/stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/ref.h" +#include "../sys/filename.h" +#include "../sys/string.h" + +#include <vector> +#include <iostream> +#include <cstdio> +#include <string.h> + +namespace embree +{ + /*! stores the location of a stream element in the source */ + class ParseLocation + { + public: + ParseLocation () : lineNumber(-1), colNumber(-1) {} + ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) + : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} + + std::string str() const + { + std::string str = "unknown"; + if (fileName) str = *fileName; + if (lineNumber >= 0) str += " line " + toString(lineNumber); + if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); + return str; + } + + private: + std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + }; + + /*! 
a stream class templated over the stream elements */ + template<typename T> class Stream : public RefCount + { + enum { BUF_SIZE = 1024 }; + + private: + virtual T next() = 0; + virtual ParseLocation location() = 0; + __forceinline std::pair<T,ParseLocation> nextHelper() { + ParseLocation l = location(); + T v = next(); + return std::pair<T,ParseLocation>(v,l); + } + __forceinline void push_back(const std::pair<T,ParseLocation>& v) { + if (past+future == BUF_SIZE) pop_front(); + size_t end = (start+past+future++)%BUF_SIZE; + buffer[end] = v; + } + __forceinline void pop_front() { + if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); + start = (start+1)%BUF_SIZE; past--; + } + public: + Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} + virtual ~Stream() {} + + public: + + const ParseLocation& loc() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].second; + } + T get() { + if (future == 0) push_back(nextHelper()); + T t = buffer[(start+past)%BUF_SIZE].first; + past++; future--; + return t; + } + const T& peek() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].first; + } + const T& unget(size_t n = 1) { + if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); + past -= n; future += n; + return peek(); + } + void drop() { + if (future == 0) push_back(nextHelper()); + past++; future--; + } + private: + size_t start,past,future; + std::vector<std::pair<T,ParseLocation> > buffer; + }; + + /*! 
warps an iostream stream */ + class StdStream : public Stream<int> + { + public: + StdStream (std::istream& cin, const std::string& name = "std::stream") + : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + ~StdStream() {} + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + int next() { + int c = cin.get(); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + private: + std::istream& cin; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! creates a stream from a file */ + class FileStream : public Stream<int> + { + public: + + FileStream (FILE* file, const std::string& name = "file") + : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + + FileStream (const FileName& fileName) + : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str()))) + { + file = fopen(fileName.c_str(),"r"); + if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); + } + ~FileStream() { if (file) fclose(file); } + + public: + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + + int next() { + int c = fgetc(file); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + FILE* file; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! 
creates a stream from a string */ + class StrStream : public Stream<int> + { + public: + + StrStream (const char* str) + : str(str), lineNumber(1), colNumber(0), charNumber(0) {} + + public: + ParseLocation location() { + return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber); + } + + int next() { + int c = str[charNumber]; + if (c == 0) return EOF; + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + const char* str; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + }; + + /*! creates a character stream from a command line */ + class CommandLineStream : public Stream<int> + { + public: + CommandLineStream (int argc, char** argv, const std::string& name = "command line") + : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) + { + if (argc > 0) { + for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; + charNumber++; + } + for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]); + } + ~CommandLineStream() {} + public: + ParseLocation location() { + return ParseLocation(name,0,charNumber,charNumber); + } + int next() { + if (i == args.size()) return EOF; + if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; } + charNumber++; + return args[i][j++]; + } + private: + size_t i,j; + std::vector<std::string> args; + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; +} diff --git a/thirdparty/embree/common/lexers/streamfilters.h b/thirdparty/embree/common/lexers/streamfilters.h new file mode 100644 index 0000000000..3592b77b03 --- /dev/null +++ b/thirdparty/embree/common/lexers/streamfilters.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + 
+namespace embree +{ + /* removes all line comments from a stream */ + class LineCommentFilter : public Stream<int> + { + public: + LineCommentFilter (const FileName& fileName, const std::string& lineComment) + : cin(new FileStream(fileName)), lineComment(lineComment) {} + LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment) + : cin(cin), lineComment(lineComment) {} + + ParseLocation location() { return cin->loc(); } + + int next() + { + /* look if the line comment starts here */ + for (size_t j=0; j<lineComment.size(); j++) { + if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; } + cin->get(); + } + /* eat all characters until the end of the line (or file) */ + while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); + + not_found: + return cin->get(); + } + + private: + Ref<Stream<int> > cin; + std::string lineComment; + }; +} diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp new file mode 100644 index 0000000000..a037869506 --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.cpp @@ -0,0 +1,51 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stringstream.h" + +namespace embree +{ + static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* simple tokenizer */ + StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine) + : cin(cin), endl(endl), multiLine(multiLine) + { + createCharMap(isSepMap,seps); + createCharMap(isValidCharMap,stringChars); + } + + std::string StringStream::next() + { + /* skip separators */ + while 
(cin->peek() != EOF) { + if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } + if (multiLine && cin->peek() == '\\') { + cin->drop(); + if (cin->peek() == '\n') { cin->drop(); continue; } + cin->unget(); + } + if (!isSeparator(cin->peek())) break; + cin->drop(); + } + + /* parse everything until the next separator */ + std::vector<char> str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + if (!isValidChar(c)) abort(); + // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); + return std::string(str.data()); + } +} diff --git a/thirdparty/embree/common/lexers/stringstream.h b/thirdparty/embree/common/lexers/stringstream.h new file mode 100644 index 0000000000..6d9c27e3cd --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.h @@ -0,0 +1,29 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /*! simple tokenizer that produces a string stream */ + class StringStream : public Stream<std::string> + { + public: + StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false); + public: + ParseLocation location() { return cin->loc(); } + std::string next(); + private: + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } + private: + Ref<Stream<int> > cin; /*! source character stream */ + bool isSepMap[256]; /*! map for fast classification of separators */ + bool isValidCharMap[256]; /*! map for valid characters */ + std::string endl; /*! the token of the end of line */ + bool multiLine; /*! 
whether to parse lines wrapped with \ */ + }; +} diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp new file mode 100644 index 0000000000..6ed6f2045a --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.cpp @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenstream.h" +#include "../math/math.h" + +namespace embree +{ + /* shorthands for common sets of characters */ + const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; + const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const std::string TokenStream::numbers = "0123456789"; + const std::string TokenStream::separators = "\n\t\r "; + const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* build full tokenizer that takes list of valid characters and keywords */ + TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols) //< symbols + : cin(cin), symbols(symbols) + { + createCharMap(isAlphaMap,alpha); + createCharMap(isSepMap,seps); + createCharMap(isStringCharMap,stringChars); + } + + bool TokenStream::decDigits(std::string& str_o) + { + bool ok = false; + std::string str; + if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::decDigits1(std::string& str_o) + 
{ + bool ok = false; + std::string str; + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; else cin->unget(str.size()); + return ok; + } + + bool TokenStream::trySymbol(const std::string& symbol) + { + size_t pos = 0; + while (pos < symbol.size()) { + if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } + cin->drop(); pos++; + } + return true; + } + + bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) + { + for (size_t i=0; i<symbols.size(); i++) { + if (!trySymbol(symbols[i])) continue; + token = Token(symbols[i],Token::TY_SYMBOL,loc); + return true; + } + return false; + } + + bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) + { + bool ok = false; + std::string str; + if (trySymbol("nan")) { + token = Token(float(nan)); + return true; + } + if (trySymbol("+inf")) { + token = Token(float(pos_inf)); + return true; + } + if (trySymbol("-inf")) { + token = Token(float(neg_inf)); + return true; + } + + if (decDigits(str)) + { + if (cin->peek() == '.') { + str += (char)cin->get(); + decDigits(str); + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1.[2]E2 + } + else ok = true; // 1.[2] + } + else if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1E2 + } + } + else + { + if (cin->peek() == '.') { + str += (char)cin->get(); + if (decDigits(str)) { + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // .3E2 + } + else ok = true; // .3 + } + } + } + if (ok) { + token = Token((float)atof(str.c_str()),loc); + } + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { + std::string str; + if (decDigits(str)) { + token = Token(atoi(str.c_str()),loc); + return true; + } + return false; + } + + bool TokenStream::tryString(Token& token, const 
ParseLocation& loc) + { + std::string str; + if (cin->peek() != '\"') return false; + cin->drop(); + while (cin->peek() != '\"') { + const int c = cin->get(); + if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); + str += (char)c; + } + cin->drop(); + token = Token(str,Token::TY_STRING,loc); + return true; + } + + bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) + { + std::string str; + if (!isAlpha(cin->peek())) return false; + str += (char)cin->get(); + while (isAlphaNum(cin->peek())) str += (char)cin->get(); + token = Token(str,Token::TY_IDENTIFIER,loc); + return true; + } + + void TokenStream::skipSeparators() + { + /* skip separators */ + while (cin->peek() != EOF && isSeparator(cin->peek())) + cin->drop(); + } + + Token TokenStream::next() + { + Token token; + skipSeparators(); + ParseLocation loc = cin->loc(); + if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ + if (tryFloat (token,loc)) return token; /**< try to parse float */ + if (tryInt (token,loc)) return token; /**< try to parse integer */ + if (tryString (token,loc)) return token; /**< try to parse string */ + if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ + if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ + return Token((char)cin->get(),loc); /**< return invalid character token */ + } +} diff --git a/thirdparty/embree/common/lexers/tokenstream.h b/thirdparty/embree/common/lexers/tokenstream.h new file mode 100644 index 0000000000..6e49dd0b39 --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" +#include <string> +#include <vector> + +namespace embree +{ + /*! 
token class */ + class Token + { + public: + + enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; + + Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} + Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} + Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} + Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} + Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} + + static Token Eof() { return Token(); } + static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } + static Token Str(std::string str) { return Token(str,TY_STRING); } + static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } + + char Char() const { + if (ty == TY_CHAR) return c; + THROW_RUNTIME_ERROR(loc.str()+": character expected"); + } + + int Int() const { + if (ty == TY_INT) return i; + THROW_RUNTIME_ERROR(loc.str()+": integer expected"); + } + + float Float(bool cast = true) const { + if (ty == TY_FLOAT) return f; + if (ty == TY_INT && cast) return (float)i; + THROW_RUNTIME_ERROR(loc.str()+": float expected"); + } + + std::string Identifier() const { + if (ty == TY_IDENTIFIER) return str; + THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); + } + + std::string String() const { + if (ty == TY_STRING) return str; + THROW_RUNTIME_ERROR(loc.str()+": string expected"); + } + + std::string Symbol() const { + if (ty == TY_SYMBOL) return str; + THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); + } + + const ParseLocation& Location() const { return loc; } + + friend bool operator==(const Token& a, const Token& b) + { + if (a.ty != b.ty) return false; + if (a.ty == TY_CHAR) return a.c == b.c; + if (a.ty == TY_INT) return a.i == b.i; + if (a.ty == TY_FLOAT) return a.f == b.f; + if (a.ty == TY_IDENTIFIER) return a.str == b.str; + if 
(a.ty == TY_STRING) return a.str == b.str; + if (a.ty == TY_SYMBOL) return a.str == b.str; + return true; + } + + friend bool operator!=(const Token& a, const Token& b) { + return !(a == b); + } + + friend bool operator <( const Token& a, const Token& b ) { + if (a.ty != b.ty) return (int)a.ty < (int)b.ty; + if (a.ty == TY_CHAR) return a.c < b.c; + if (a.ty == TY_INT) return a.i < b.i; + if (a.ty == TY_FLOAT) return a.f < b.f; + if (a.ty == TY_IDENTIFIER) return a.str < b.str; + if (a.ty == TY_STRING) return a.str < b.str; + if (a.ty == TY_SYMBOL) return a.str < b.str; + return false; + } + + friend std::ostream& operator<<(std::ostream& cout, const Token& t) + { + if (t.ty == TY_EOF) return cout << "eof"; + if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; + if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; + if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; + if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; + if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; + if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; + return cout << "unknown"; + } + + private: + Type ty; //< the type of the token + union { + char c; //< data for char tokens + int i; //< data for int tokens + float f; //< data for float tokens + }; + std::string str; //< data for string and identifier tokens + ParseLocation loc; //< the location the token is from + }; + + /*! build full tokenizer that takes list of valid characters and keywords */ + class TokenStream : public Stream<Token> + { + public: + + /*! 
shorthands for common sets of characters */ + static const std::string alpha; + static const std::string ALPHA; + static const std::string numbers; + static const std::string separators; + static const std::string stringChars; + + public: + TokenStream(const Ref<Stream<int> >& cin, + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols + public: + ParseLocation location() { return cin->loc(); } + Token next(); + bool trySymbol(const std::string& symbol); + + private: + void skipSeparators(); + bool decDigits(std::string& str); + bool decDigits1(std::string& str); + bool trySymbols(Token& token, const ParseLocation& loc); + bool tryFloat(Token& token, const ParseLocation& loc); + bool tryInt(Token& token, const ParseLocation& loc); + bool tryString(Token& token, const ParseLocation& loc); + bool tryIdentifier(Token& token, const ParseLocation& loc); + + Ref<Stream<int> > cin; + bool isSepMap[256]; + bool isAlphaMap[256]; + bool isStringCharMap[256]; + std::vector<std::string> symbols; + + /*! checks if a character is a separator */ + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + + /*! checks if a character is a number */ + __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } + + /*! checks if a character is valid inside a string */ + __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } + + /*! 
checks if a character is legal for an identifier */ + __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } + __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } + }; +} diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h new file mode 100644 index 0000000000..9d4a0f0846 --- /dev/null +++ b/thirdparty/embree/common/math/affinespace.h @@ -0,0 +1,361 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linearspace2.h" +#include "linearspace3.h" +#include "quaternion.h" +#include "bbox.h" +#include "vec4.h" + +namespace embree +{ + #define VectorT typename L::Vector + #define ScalarT typename L::Vector::Scalar + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> + struct AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT ( ) { } + __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} + + template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} + + 
//////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! 
return matrix for looking at given point, only in 3D */ + static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(up,Z)); + VectorT V = normalize(cross(Z,U)); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + // template specialization to get correct identity matrix for type AffineSpace3fa + template<> + __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); } + template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); } + template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); } + + template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); } + 
template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); } + template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); } + + template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; } + + template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } + template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } + + __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) + { + BBox3fa dst = empty; + const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + const Vec3fa 
p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + return dst; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; } + template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) { + return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Template Instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef AffineSpaceT<LinearSpace2f> AffineSpace2f; + typedef AffineSpaceT<LinearSpace3f> AffineSpace3f; + typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa; + typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx; + typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff; + typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f; + + template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>; + typedef 
AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16; + + template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T, typename R> + __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0, + const AffineSpaceT<T>& M1, + const R& t) + { + return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); + } + + // slerp interprets the 16 floats of the matrix M = D * R * S as components of + // three matrizes (D, R, S) that are interpolated individually. 
+ template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>> + slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0, + const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1, + const T& t) + { + QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + QuaternionT<T> q = slerp(q0, q1, t); + + AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); + AffineSpaceT<LinearSpace3<Vec3<T>>> D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); + return D * R * S; + } + + // this is a specialized version for Vec3fa because that does + // not play along nicely with the other templated Vec3/Vec4 types + __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, + const AffineSpace3ff& M1, + const float& t) + { + Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + Quaternion3f q = slerp(q0, q1, t); + + AffineSpace3fa S = lerp(M0, M1, t); + AffineSpace3fa D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * S; + } + + __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) + { + // compute affine transform from quaternion decomposition + Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + AffineSpace3fa M = qd; + AffineSpace3fa D(one); + D.p.x = M.l.vx.y; + D.p.y = M.l.vx.z; + D.p.z = M.l.vy.z; + M.l.vx.y = 0; + M.l.vx.z = 0; + M.l.vy.z = 0; + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * M; + } + + __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) + { + q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + S = qd; + T.x = qd.l.vx.y; + T.y = qd.l.vx.z; + T.z = 
qd.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + } + + __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) + { + AffineSpace3ff M = S; + M.l.vx.w = q.i; + M.l.vy.w = q.j; + M.l.vz.w = q.k; + M.p.w = q.r; + M.l.vx.y = T.x; + M.l.vx.z = T.y; + M.l.vy.z = T.z; + return M; + } + + struct __aligned(16) QuaternionDecomposition + { + float scale_x = 1.f; + float scale_y = 1.f; + float scale_z = 1.f; + float skew_xy = 0.f; + float skew_xz = 0.f; + float skew_yz = 0.f; + float shift_x = 0.f; + float shift_y = 0.f; + float shift_z = 0.f; + float quaternion_r = 1.f; + float quaternion_i = 0.f; + float quaternion_j = 0.f; + float quaternion_k = 0.f; + float translation_x = 0.f; + float translation_y = 0.f; + float translation_z = 0.f; + }; + + __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) + { + QuaternionDecomposition qd; + qd.scale_x = M.l.vx.x; + qd.scale_y = M.l.vy.y; + qd.scale_z = M.l.vz.z; + qd.shift_x = M.p.x; + qd.shift_y = M.p.y; + qd.shift_z = M.p.z; + qd.translation_x = M.l.vx.y; + qd.translation_y = M.l.vx.z; + qd.translation_z = M.l.vy.z; + qd.skew_xy = M.l.vy.x; + qd.skew_xz = M.l.vz.x; + qd.skew_yz = M.l.vz.y; + qd.quaternion_r = M.p.w; + qd.quaternion_i = M.l.vx.w; + qd.quaternion_j = M.l.vy.w; + qd.quaternion_k = M.l.vz.w; + return qd; + } + + //////////////////////////////////////////////////////////////////////////////// + /* + * ! 
Template Specialization for 2D: return matrix for rotation around point + * (rotation around arbitrarty vector is not meaningful in 2D) + */ + template<> __forceinline + AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { + return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); + } + + //////////////////////////////////////////////////////////////////////////////// + // Similarity Transform + // + // checks, if M is a similarity transformation, i.e if there exists a factor D + // such that for all x,y: distance(Mx, My) = D * distance(x, y) + //////////////////////////////////////////////////////////////////////////////// + __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) + { + if (D) *D = 0.f; + if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; + if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; + if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; + + const float D_x = dot(M.l.vx, M.l.vx); + const float D_y = dot(M.l.vy, M.l.vy); + const float D_z = dot(M.l.vz, M.l.vz); + + if (abs(D_x - D_y) > 1e-5f || + abs(D_x - D_z) > 1e-5f || + abs(D_y - D_z) > 1e-5f) + return false; + + if (D) *D = sqrtf(D_x); + return true; + } + + __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) + { + Vec3fa::storeu(&ptr->l.vx, source.l.vx); + Vec3fa::storeu(&ptr->l.vy, source.l.vy); + Vec3fa::storeu(&ptr->l.vz, source.l.vz); + Vec3fa::storeu(&ptr->p, source.p); + } + + __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) + { + AffineSpace3fa space; + space.l.vx = Vec3fa::loadu(&ptr->l.vx); + space.l.vy = Vec3fa::loadu(&ptr->l.vy); + space.l.vz = Vec3fa::loadu(&ptr->l.vz); + space.p = Vec3fa::loadu(&ptr->p); + return space; + } + + #undef VectorT + #undef ScalarT +} diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h new file mode 100644 index 0000000000..bc43155358 --- /dev/null +++ 
b/thirdparty/embree/common/math/bbox.h @@ -0,0 +1,331 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" + +namespace embree +{ + namespace internal { + + template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); } + template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; } + template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; } + + } // namespace internal + template<typename T> + struct BBox + { + T lower, upper; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox ( ) { } + template<typename T1> + __forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} + __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline BBox ( const T& v ) : lower(v), upper(v) {} + __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Extending Bounds + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + /*! tests if box is empty */ + __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; } + + /*! computes the size of the box */ + __forceinline T size() const { return upper - lower; } + + /*! 
computes the center of the box */ + __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); } + + /*! computes twice the center of the box */ + __forceinline T center2() const { return lower+upper; } + + /*! merges two boxes */ + __forceinline static const BBox merge (const BBox& a, const BBox& b) { + return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); + } + + /*! enlarge box by some scaling factor */ + __forceinline BBox enlarge_by(const float a) const { + return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} + }; + + template<> __forceinline bool BBox<float>::empty() const { + return lower > upper; + } + +#if defined(__SSE__) + template<> __forceinline bool BBox<Vec3fa>::empty() const { + return !all(le_mask(lower,upper)); + } + template<> __forceinline bool BBox<Vec3fx>::empty() const { + return !all(le_mask(lower,upper)); + } +#endif + + /*! tests if box is finite */ + __forceinline bool isvalid( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); + } + + /*! tests if box is finite and non-empty*/ + __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); + } + + /*! 
tests if box has finite entries */ + __forceinline bool is_finite( const BBox<Vec3fa>& b) { + return is_finite(b.lower) && is_finite(b.upper); + } + + /*! test if point contained in box */ + __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } + + /*! computes the center of the box */ + template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; } + template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); } + + /*! computes the volume of a bounding box */ + __forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); } + __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); } + + /*! computes the volume of a bounding box */ + __forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); } + + /*! computes the surface area of a bounding box */ + template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; } + + template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); } + template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); } + + template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); } + + template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { + return halfArea(box); + } + + /*! 
merges bounding boxes and points */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); } + template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); } + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); } + + /*! Merges three boxes. */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); } + + /*! Merges four boxes. */ + template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! Comparison Operators */ + template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; } + template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; } + + /*! scaling */ + template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + + /*! 
translations */ + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); } + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); } + + /*! extension */ + template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); } + + /*! intersect bounding boxes */ + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + /*! subtract bounds from each other */ + template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d) + { + c.lower = a.lower; + c.upper = min(a.upper,b.lower); + d.lower = max(a.lower,b.upper); + d.upper = a.upper; + } + + /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ + template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); } + template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); } + template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); } + + /*! 
tests if bounding boxes (and points) are conjoint (non-empty intersection) */ + template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); } + template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); } + template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); } + + /*! subset relation */ + template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b ) + { + for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; + for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; + return true; + } + + template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + /*! blending */ + template<typename T> + __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) { + return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); + } + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) { + return cout << "[" << box.lower << "; " << box.upper << "]"; + } + + /*! 
default template instantiations */ + typedef BBox<float> BBox1f; + typedef BBox<Vec2f> BBox2f; + typedef BBox<Vec2fa> BBox2fa; + typedef BBox<Vec3f> BBox3f; + typedef BBox<Vec3fa> BBox3fa; + typedef BBox<Vec3fx> BBox3fx; + typedef BBox<Vec3ff> BBox3ff; +} + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<int N> + __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds); + + template<> + __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat4>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } + +#if defined(__AVX__) + template<> + __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat8>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + (vfloat4&)bounds[4].lower, + (vfloat4&)bounds[5].lower, + (vfloat4&)bounds[6].lower, + (vfloat4&)bounds[7].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + (vfloat4&)bounds[4].upper, + (vfloat4&)bounds[5].upper, + (vfloat4&)bounds[6].upper, + (vfloat4&)bounds[7].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } 
+#endif + + template<int N> + __forceinline BBox3fa merge(const BBox3fa* bounds); + + template<> + __forceinline BBox3fa merge<4>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), + min(bounds[2].lower,bounds[3].lower)); + const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), + max(bounds[2].upper,bounds[3].upper)); + return BBox3fa(lower,upper); + } + +#if defined(__AVX__) + template<> + __forceinline BBox3fa merge<8>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), + min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); + const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), + max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); + return BBox3fa(lower,upper); + } +#endif +} + diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h new file mode 100644 index 0000000000..3f50c04393 --- /dev/null +++ b/thirdparty/embree/common/math/col3.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col3 + { + T r, g, b; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 ( ) { } + __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } + __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } + + __forceinline explicit Col3 (const T& v) : r(v), g(v), 
b(v) {} + __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} + __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} + __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} + __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } + + /*! default template instantiations */ + typedef Col3<unsigned char> Col3uc; + typedef Col3<float > Col3f; +} diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h new file mode 100644 index 0000000000..788508516b --- /dev/null +++ b/thirdparty/embree/common/math/col4.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col4 + { + T r, g, b, a; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 ( ) { } + __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } + __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } + + __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} + __forceinline Col4 
(const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} + __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} + __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} + __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; + } + + /*! default template instantiations */ + typedef Col4<unsigned char> Col4uc; + typedef Col4<float > Col4f; +} diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h new file mode 100644 index 0000000000..529584ea16 --- /dev/null +++ b/thirdparty/embree/common/math/color.h @@ -0,0 +1,241 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "constants.h" +#include "col3.h" +#include "col4.h" + +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color4 + { + union { + __m128 m128; + struct { float r,g,b,a; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4 () {} + __forceinline Color4 ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color4 (const float r, 
const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} + + __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } + __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } + + __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} + __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = (unsigned char)(s[3]); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + 
__forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color + { + union { + __m128 m128; + struct { float r,g,b; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color () {} + __forceinline Color ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} + + __forceinline Color ( const Color& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } + + __forceinline Color ( const Color4& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = 255; + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a ) { return a; } + __forceinline const Color operator -( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline const Color abs ( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline const Color rcp ( const Color& a ) + { +#if defined(__AVX512VL__) + const Color r = _mm_rcp14_ps(a.m128); +#else + const Color r = _mm_rcp_ps(a.m128); +#endif + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + } + __forceinline const Color rsqrt( const Color& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline const 
Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } + __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } + __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } + __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } + + __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } + __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } + __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } + __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } + __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } + __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } + __forceinline float reduce_max(const Color& v) { return 
max(v.r,v.g,v.b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + __forceinline bool operator < ( const Color& a, const Color& b ) { + if (a.r != b.r) return a.r < b.r; + if (a.g != b.g) return a.g < b.g; + if (a.b != b.b) return a.b < b.b; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color select( bool s, const Color& t, const Color& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Special Operators + //////////////////////////////////////////////////////////////////////////////// + + /*! computes luminance of a color */ + __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } + + /*! 
output operator */ + __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } +} diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp new file mode 100644 index 0000000000..03919ae20c --- /dev/null +++ b/thirdparty/embree/common/math/constants.cpp @@ -0,0 +1,27 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "constants.h" + +namespace embree +{ + TrueTy True; + FalseTy False; + ZeroTy zero; + OneTy one; + NegInfTy neg_inf; + PosInfTy inf; + PosInfTy pos_inf; + NaNTy nan; + UlpTy ulp; + PiTy pi; + OneOverPiTy one_over_pi; + TwoPiTy two_pi; + OneOverTwoPiTy one_over_two_pi; + FourPiTy four_pi; + OneOverFourPiTy one_over_four_pi; + StepTy step; + ReverseStepTy reverse_step; + EmptyTy empty; + UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h new file mode 100644 index 0000000000..578473a8ab --- /dev/null +++ b/thirdparty/embree/common/math/constants.h @@ -0,0 +1,197 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +#include <limits> + +#define _USE_MATH_DEFINES +#include <math.h> // using cmath causes issues under Windows +#include <cfloat> +#include <climits> + +namespace embree +{ + static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; + static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail + + /* we consider floating point numbers in that range as valid input numbers */ + static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; + + struct TrueTy { + __forceinline operator bool( ) const { return true; } + }; + + extern MAYBE_UNUSED TrueTy True; + + struct FalseTy { + __forceinline operator bool( ) const { return false; } + }; + + extern 
MAYBE_UNUSED FalseTy False; + + struct ZeroTy + { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator char ( ) const { return 0; } + __forceinline operator unsigned char ( ) const { return 0; } + }; + + extern MAYBE_UNUSED ZeroTy zero; + + struct OneTy + { + __forceinline operator double ( ) const { return 1; } + __forceinline operator float ( ) const { return 1; } + __forceinline operator long long( ) const { return 1; } + __forceinline operator unsigned long long( ) const { return 1; } + __forceinline operator long ( ) const { return 1; } + __forceinline operator unsigned long ( ) const { return 1; } + __forceinline operator int ( ) const { return 1; } + __forceinline operator unsigned int ( ) const { return 1; } + __forceinline operator short ( ) const { return 1; } + __forceinline operator unsigned short ( ) const { return 1; } + __forceinline operator char ( ) const { return 1; } + __forceinline operator unsigned char ( ) const { return 1; } + }; + + extern MAYBE_UNUSED OneTy one; + + struct NegInfTy + { + __forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); } + __forceinline 
operator long ( ) const { return std::numeric_limits<long>::min(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::min(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::min(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::min(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::min(); } + + }; + + extern MAYBE_UNUSED NegInfTy neg_inf; + + struct PosInfTy + { + __forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); } + __forceinline operator long ( ) const { return std::numeric_limits<long>::max(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::max(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::max(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::max(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); } + }; + + extern MAYBE_UNUSED PosInfTy inf; + 
extern MAYBE_UNUSED PosInfTy pos_inf; + + struct NaNTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } + }; + + extern MAYBE_UNUSED NaNTy nan; + + struct UlpTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } + }; + + extern MAYBE_UNUSED UlpTy ulp; + + struct PiTy + { + __forceinline operator double( ) const { return double(M_PI); } + __forceinline operator float ( ) const { return float(M_PI); } + }; + + extern MAYBE_UNUSED PiTy pi; + + struct OneOverPiTy + { + __forceinline operator double( ) const { return double(M_1_PI); } + __forceinline operator float ( ) const { return float(M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverPiTy one_over_pi; + + struct TwoPiTy + { + __forceinline operator double( ) const { return double(2.0*M_PI); } + __forceinline operator float ( ) const { return float(2.0*M_PI); } + }; + + extern MAYBE_UNUSED TwoPiTy two_pi; + + struct OneOverTwoPiTy + { + __forceinline operator double( ) const { return double(0.5*M_1_PI); } + __forceinline operator float ( ) const { return float(0.5*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + + struct FourPiTy + { + __forceinline operator double( ) const { return double(4.0*M_PI); } + __forceinline operator float ( ) const { return float(4.0*M_PI); } + }; + + extern MAYBE_UNUSED FourPiTy four_pi; + + struct OneOverFourPiTy + { + __forceinline operator double( ) const { return double(0.25*M_1_PI); } + __forceinline operator float ( ) const { return float(0.25*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + + struct StepTy { + }; + + extern MAYBE_UNUSED StepTy step; + + struct ReverseStepTy { + }; + + extern MAYBE_UNUSED ReverseStepTy reverse_step; + + struct EmptyTy { + }; + 
+ extern MAYBE_UNUSED EmptyTy empty; + + struct FullTy { + }; + + extern MAYBE_UNUSED FullTy full; + + struct UndefinedTy { + }; + + extern MAYBE_UNUSED UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/interval.h b/thirdparty/embree/common/math/interval.h new file mode 100644 index 0000000000..310add2129 --- /dev/null +++ b/thirdparty/embree/common/math/interval.h @@ -0,0 +1,161 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" +#include "bbox.h" + +namespace embree +{ + template<typename V> + struct Interval + { + V lower, upper; + + __forceinline Interval() {} + __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } + __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline Interval(const V& a) : lower(a), upper(a) {} + __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} + __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {} + + /*! tests if box is empty */ + //__forceinline bool empty() const { return lower > upper; } + + /*! 
computes the size of the interval */ + __forceinline V size() const { return upper - lower; } + + __forceinline V center() const { return 0.5f*(lower+upper); } + + __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { + return Interval(a.lower+b.lower,a.upper+b.upper); + } + + __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { + return Interval(a.lower-b.upper,a.upper-b.lower); + } + + __forceinline friend Interval operator -( const Interval& a, const V& b ) { + return Interval(a.lower-b,a.upper-b); + } + + __forceinline friend Interval operator *( const Interval& a, const Interval& b ) + { + const V ll = a.lower*b.lower; + const V lu = a.lower*b.upper; + const V ul = a.upper*b.lower; + const V uu = a.upper*b.upper; + return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b) { + return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { + return merge(merge(a,b),c); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! 
intersect bounding boxes */ + __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { + return cout << "[" << a.lower << ", " << a.upper << "]"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} + }; + + __forceinline bool isEmpty(const Interval<float>& v) { + return v.lower > v.upper; + } + + __forceinline vboolx isEmpty(const Interval<vfloatx>& v) { + return v.lower > v.upper; + } + + /*! 
subset relation */ + template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { + return (a.lower > b.lower) && (a.upper < b.upper); + } + + template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return subset(a.x,b.x) && subset(a.y,b.y); + } + + template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1) + { + float eps = 1E-4f; + bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; + bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; + return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); + } + + typedef Interval<float> Interval1f; + typedef Vec2<Interval<float>> Interval2f; + typedef Vec3<Interval<float>> Interval3f; + +inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } + +inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } + +#define TWO_PI (2.0*M_PI) +inline Interval1f sin(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, 
-TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float sinLower = sin(interval.lower); + float sinUpper = sin(interval.upper); + if (sinLower > sinUpper) swap(sinLower, sinUpper); + if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; + if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; + return Interval1f(sinLower, sinUpper); +} + +inline Interval1f cos(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float cosLower = cos(interval.lower); + float cosUpper = cos(interval.upper); + if (cosLower > cosUpper) swap(cosLower, cosUpper); + if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; + return Interval1f(cosLower, cosUpper); +} +#undef TWO_PI +} diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h new file mode 100644 index 0000000000..2b397a05c8 --- /dev/null +++ b/thirdparty/embree/common/math/lbbox.h @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "range.h" + +namespace embree +{ + template<typename T> + __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt) + { + const float rcp_dt_size = float(1.0f)/dt.size(); + const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); + const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); + return std::make_pair(g0,g1); + } + + template<typename T> + struct LBBox + { + public: + __forceinline LBBox () {} + + template<typename T1> + __forceinline LBBox ( const LBBox<T1>& other ) + : bounds0(other.bounds0), bounds1(other.bounds1) 
{} + + __forceinline LBBox& operator= ( const LBBox& other ) { + bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; + } + + __forceinline LBBox (EmptyTy) + : bounds0(EmptyTy()), bounds1(EmptyTy()) {} + + __forceinline explicit LBBox ( const BBox<T>& bounds) + : bounds0(bounds), bounds1(bounds) { } + + __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) + : bounds0(bounds0), bounds1(bounds1) { } + + LBBox ( const avector<BBox<T>>& bounds ) + { + assert(bounds.size()); + BBox<T> b0 = bounds.front(); + BBox<T> b1 = bounds.back(); + for (size_t i=1; i<bounds.size()-1; i++) { + const float f = float(i)/float(bounds.size()-1); + const BBox<T> bt = lerp(b0,b1,f); + const T dlower = min(bounds[i].lower-bt.lower,T(zero)); + const T dupper = max(bounds[i].upper-bt.upper,T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) + { + const float lower = time_range.lower*numTimeSegments; + const float upper = time_range.upper*numTimeSegments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const int ilower = (int)ilowerf; + const int iupper = (int)iupperf; + + const BBox<T> blower0 = bounds(ilower); + const BBox<T> bupper1 = bounds(iupper); + + if (iupper-ilower == 1) { + bounds0 = lerp(blower0, bupper1, lower-ilowerf); + bounds1 = lerp(bupper1, blower0, iupperf-upper); + return; + } + + const BBox<T> blower1 = bounds(ilower+1); + const BBox<T> bupper0 = bounds(iupper-1); + BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf); + BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper); + + for (int i = ilower+1; i < iupper; i++) + { + const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); + const BBox<T> bt = 
lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) + { + /* normalize global time_range_in to local geom_time_range */ + const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), + (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); + + const float lower = time_range.lower*geom_time_segments; + const float upper = time_range.upper*geom_time_segments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const float ilowerfc = max(0.0f,ilowerf); + const float iupperfc = min(iupperf,geom_time_segments); + const int ilowerc = (int)ilowerfc; + const int iupperc = (int)iupperfc; + assert(iupperc-ilowerc > 0); + + /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ + const int ilower_iter = max(-1,(int)ilowerf); + const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); + + const BBox<T> blower0 = bounds(ilowerc); + const BBox<T> bupper1 = bounds(iupperc); + if (iupper_iter-ilower_iter == 1) { + bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); + bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); + return; + } + + const BBox<T> blower1 = bounds(ilowerc+1); + const BBox<T> bupper0 = bounds(iupperc-1); + BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); + BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); + + for (int i = ilower_iter+1; i < iupper_iter; i++) + { + const float f = 
(float(i)/geom_time_segments - time_range.lower) / time_range.size(); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments) + { + const int ilower = time_range.begin(); + const int iupper = time_range.end(); + + BBox<T> b0 = bounds(ilower); + BBox<T> b1 = bounds(iupper); + + if (iupper-ilower == 1) + { + bounds0 = b0; + bounds1 = b1; + return; + } + + for (int i = ilower+1; i<iupper; i++) + { + const float f = float(i - time_range.begin()) / float(time_range.size()); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + public: + + __forceinline bool empty() const { + return bounds().empty(); + } + + __forceinline BBox<T> bounds () const { + return merge(bounds0,bounds1); + } + + __forceinline BBox<T> interpolate( const float t ) const { + return lerp(bounds0,bounds1,t); + } + + __forceinline LBBox<T> interpolate( const BBox1f& dt ) const { + return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper)); + } + + __forceinline void extend( const LBBox& other ) { + bounds0.extend(other.bounds0); + bounds1.extend(other.bounds1); + } + + __forceinline float expectedHalfArea() const; + + __forceinline float expectedHalfArea(const BBox1f& dt) const { + return interpolate(dt).expectedHalfArea(); + } + + __forceinline float expectedApproxHalfArea() const { + return 
0.5f*(halfArea(bounds0) + halfArea(bounds1)); + } + + /* calculates bounds for [0,1] time range from bounds in dt time range */ + __forceinline LBBox global(const BBox1f& dt) const + { + const float rcp_dt_size = 1.0f/dt.size(); + const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size); + const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); + return LBBox(b0,b1); + } + + /*! Comparison Operators */ + //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { + return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; + } + + public: + BBox<T> bounds0, bounds1; + }; + + /*! 
tests if box is finite */ + template<typename T> + __forceinline bool isvalid( const LBBox<T>& v ) { + return isvalid(v.bounds0) && isvalid(v.bounds1); + } + + template<typename T> + __forceinline bool isvalid_non_empty( const LBBox<T>& v ) { + return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); + } + + template<typename T> + __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) + { + const T da = a1-a0; + const T db = b1-b0; + return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); + } + + template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const + { + const Vec3fa d0 = bounds0.size(); + const Vec3fa d1 = bounds1.size(); + return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), + Vec3fa(d1.x,d1.y,d1.z), + Vec3fa(d0.y,d0.z,d0.x), + Vec3fa(d1.y,d1.z,d1.x))); + } + + template<typename T> + __forceinline float expectedApproxHalfArea(const LBBox<T>& box) { + return box.expectedApproxHalfArea(); + } + + template<typename T> + __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) { + return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); + } + + /*! subset relation */ + template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) { + return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); + } + + /*! 
default template instantiations */ + typedef LBBox<float> LBBox1f; + typedef LBBox<Vec2f> LBBox2f; + typedef LBBox<Vec3f> LBBox3f; + typedef LBBox<Vec3fa> LBBox3fa; + typedef LBBox<Vec3fx> LBBox3fx; +} diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h new file mode 100644 index 0000000000..184ee695fb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace2.h @@ -0,0 +1,148 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace2 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace2 ( ) {} + __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace2(const Vector& vx, const Vector& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, + const Scalar& m10, const Scalar& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! 
compute transposed matrix */ + __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace2 scale(const Vector& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static __forceinline LinearSpace2 rotate(const Scalar& r) { + Scalar s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const + { + LinearSpace2 m = *this; + + // mirrored? + Scalar mirror(one); + if (m.det() < Scalar(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! 
the column vectors of the matrix */ + Vector vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } + template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const 
LinearSpace2<T>& b ) { return a = a * b; } + template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } + template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace2<Vec2f> LinearSpace2f; + typedef LinearSpace2<Vec2fa> LinearSpace2fa; +} diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h new file mode 100644 index 0000000000..9eaa2cc2bb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace3.h @@ -0,0 +1,213 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "quaternion.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace3 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! 
default matrix constructor */ + __forceinline LinearSpace3 ( ) {} + __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) + : vx(vx), vy(vy), vz(vz) {} + + /*! construction from quaternion */ + __forceinline LinearSpace3( const QuaternionT<Scalar>& q ) + : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) + , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) + , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, + const Scalar& m10, const Scalar& m11, const Scalar& m12, + const Scalar& m20, const Scalar& m21, const Scalar& m22) + : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } + + /*! 
returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } + + /*! returns third row of matrix */ + __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} + __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace3 scale(const Vector& s) { + return LinearSpace3(s.x, 0, 0, + 0 , s.y, 0, + 0 , 0, s.z); + } + + /*! return matrix for rotation around arbitrary axis */ + static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { + Vector u = normalize(_u); + Scalar s = sin(r), c = cos(r); + return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, + u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, + u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy,vz; + }; + + /*! 
compute transposed matrix */ + template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { + vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); + return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); + } + + template<typename T> + __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { + return xfm.transposed(); + } + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } + template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N) + { + const T dx0(0,N.z,-N.y); + const T dx1(-N.z,0,N.x); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) { + return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + 
clamp(space.vz,T(-1.0f),T(1.0f))); + } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } + template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; } + template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; } + + template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T 
xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { + return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); + } + + /*! blending */ + template<typename T> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. 
*/ + typedef LinearSpace3<Vec3f> LinearSpace3f; + typedef LinearSpace3<Vec3fa> LinearSpace3fa; + typedef LinearSpace3<Vec3fx> LinearSpace3fx; + typedef LinearSpace3<Vec3ff> LinearSpace3ff; + + template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; + typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4; + typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8; + typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16; + + /*! blending */ + template<typename T, typename S> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, + const LinearSpace3<T>& l1, + const S& t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + +} diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h new file mode 100644 index 0000000000..4bc54c1a6a --- /dev/null +++ b/thirdparty/embree/common/math/math.h @@ -0,0 +1,369 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "constants.h" +#include <cmath> + +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <emmintrin.h> +#include <xmmintrin.h> +#include <immintrin.h> +#endif + +#if defined(__WIN32__) +#if defined(_MSC_VER) && (_MSC_VER <= 1700) +namespace std +{ + __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } + __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } + __forceinline bool isfinite (const float x) { return _finite(x) != 0; } +} +#endif +#endif + +namespace embree +{ + __forceinline bool isvalid ( const float& v ) { + return (v > -FLT_LARGE) & (v < +FLT_LARGE); + } + + __forceinline int cast_f2i(float f) { + union { float f; int i; } v; v.f = f; return v.i; + } + + __forceinline float cast_i2f(int i) { + union { float f; int i; } v; v.i = i; return v.f; + } + + __forceinline int toInt (const float& a) { return int(a); } + __forceinline 
float toFloat(const int& a) { return float(a); } + +#if defined(__WIN32__) + __forceinline bool finite ( const float x ) { return _finite(x) != 0; } +#endif + + __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } + __forceinline float sqr ( const float x ) { return x*x; } + + __forceinline float rcp ( const float x ) + { + const __m128 a = _mm_set_ss(x); + +#if defined(__AVX512VL__) + const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rcp_ss(a); +#endif + +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +#else + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +#endif + } + + __forceinline float signmsk ( const float x ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); + } + __forceinline float xorf( const float x, const float y ) { + return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); + } + __forceinline float andf( const float x, const unsigned y ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); + } + __forceinline float rsqrt( const float x ) + { + const __m128 a = _mm_set_ss(x); +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); +#else + __m128 r = _mm_rsqrt_ss(a); +#endif + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#if defined(__ARM_NEON) + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#endif + return _mm_cvtss_f32(r); + } + +#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) + __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } + __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } + __forceinline int roundf(float f) 
{ return (int)(f + 0.5f); } +#else + __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } + __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } +#endif + + __forceinline float abs ( const float x ) { return ::fabsf(x); } + __forceinline float acos ( const float x ) { return ::acosf (x); } + __forceinline float asin ( const float x ) { return ::asinf (x); } + __forceinline float atan ( const float x ) { return ::atanf (x); } + __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } + __forceinline float cos ( const float x ) { return ::cosf (x); } + __forceinline float cosh ( const float x ) { return ::coshf (x); } + __forceinline float exp ( const float x ) { return ::expf (x); } + __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } + __forceinline float log ( const float x ) { return ::logf (x); } + __forceinline float log10( const float x ) { return ::log10f(x); } + __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } + __forceinline float sin ( const float x ) { return ::sinf (x); } + __forceinline float sinh ( const float x ) { return ::sinhf (x); } + __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } + __forceinline float tan ( const float x ) { return ::tanf (x); } + __forceinline float tanh ( const float x ) { return ::tanhf (x); } + __forceinline float floor( const float x ) { return ::floorf (x); } + __forceinline float ceil ( const float x ) { return ::ceilf (x); } + __forceinline float frac ( const float x ) { return x-floor(x); } + + __forceinline double abs ( const double x ) { return ::fabs(x); } + __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } + __forceinline double acos ( const double x ) { return ::acos (x); } + __forceinline double asin ( const double x ) { return ::asin (x); } + __forceinline double atan ( const double x ) { return ::atan (x); } + __forceinline 
double atan2( const double y, const double x ) { return ::atan2(y, x); } + __forceinline double cos ( const double x ) { return ::cos (x); } + __forceinline double cosh ( const double x ) { return ::cosh (x); } + __forceinline double exp ( const double x ) { return ::exp (x); } + __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } + __forceinline double log ( const double x ) { return ::log (x); } + __forceinline double log10( const double x ) { return ::log10(x); } + __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } + __forceinline double rcp ( const double x ) { return 1.0/x; } + __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } + __forceinline double sin ( const double x ) { return ::sin (x); } + __forceinline double sinh ( const double x ) { return ::sinh (x); } + __forceinline double sqr ( const double x ) { return x*x; } + __forceinline double sqrt ( const double x ) { return ::sqrt (x); } + __forceinline double tan ( const double x ) { return ::tan (x); } + __forceinline double tanh ( const double x ) { return ::tanh (x); } + __forceinline double floor( const double x ) { return ::floor (x); } + __forceinline double ceil ( const double x ) { return ::ceil (x); } + +#if defined(__SSE4_1__) + __forceinline float mini(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_min_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + +#if defined(__SSE4_1__) + __forceinline float maxi(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_max_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + + template<typename T> + __forceinline T twice(const T& a) { return a+a; } + + __forceinline int min(int a, int b) { return a<b ? 
a:b; } + __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; } + __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } + __forceinline float min(float a, float b) { return a<b ? a:b; } + __forceinline double min(double a, double b) { return a<b ? a:b; } +#if defined(__64BIT__) + __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } +#endif + + template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } + + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } + + __forceinline int max(int a, int b) { return a<b ? b:a; } + __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; } + __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } + __forceinline float max(float a, float b) { return a<b ? b:a; } + __forceinline double max(double a, double b) { return a<b ? b:a; } +#if defined(__64BIT__) + __forceinline size_t max(size_t a, size_t b) { return a<b ? 
b:a; } +#endif + + template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } + + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } + +#if defined(__MACOSX__) + __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } + __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; } +#endif + +#if defined(__MACOSX__) && !defined(__INTEL_COMPILER) + __forceinline void sincosf(float x, float *sin, float *cos) { + __sincosf(x,sin,cos); + } +#endif + +#if defined(__WIN32__) || defined(__FreeBSD__) + __forceinline void sincosf(float x, float *s, float *c) { + *s = sinf(x); *c = cosf(x); + } +#endif + + template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } + template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } + + template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } + template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } + template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } + template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } + +#if defined(__AVX2__) + __forceinline float madd ( const 
float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } +#else + __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } + __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } + __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} + __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } +#endif + + /*! random functions */ + template<typename T> T random() { return T(0); } +#if defined(_WIN32) + template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } +#else + template<> __forceinline int random() { return int(rand()); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } +#endif + template<> __forceinline float random() { return rand()/float(RAND_MAX); } + template<> __forceinline double random() { return rand()/double(RAND_MAX); } + +#if _WIN32 + __forceinline double drand48() { + return double(rand())/double(RAND_MAX); + } + + __forceinline void srand48(long seed) { + return srand(seed); + } +#endif + + /*! selects */ + __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } + __forceinline int select(bool s, int t, int f) { return s ? 
t : f; } + __forceinline float select(bool s, float t, float f) { return s ? t : f; } + + __forceinline bool all(bool s) { return s; } + + __forceinline float lerp(const float v0, const float v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + template<typename T> + __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { + return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); + } + + /*! exchange */ + template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + /* load/store */ + template<typename Ty> struct mem; + + template<> struct mem<float> { + static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + + static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + }; + + /*! bit reverse operation */ + template<class T> + __forceinline T bitReverse(const T& vin) + { + T v = vin; + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ( v >> 16 ) | ( v << 16); + return v; + } + + /*! 
bit interleave operation */ + template<class T> + __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) + { + T x = xin, y = yin, z = zin; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); + } + +#if defined(__AVX2__) + + template<> + __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) + { + const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); + const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); + const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); + return xx | yy | zz; + } + +#endif + + /*! 
bit interleave operation for 64bit data types*/ + template<class T> + __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ + T x = xin & 0x1fffff; + T y = yin & 0x1fffff; + T z = zin & 0x1fffff; + + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + + y = (y | y << 32) & 0x1f00000000ffff; + y = (y | y << 16) & 0x1f0000ff0000ff; + y = (y | y << 8) & 0x100f00f00f00f00f; + y = (y | y << 4) & 0x10c30c30c30c30c3; + y = (y | y << 2) & 0x1249249249249249; + + z = (z | z << 32) & 0x1f00000000ffff; + z = (z | z << 16) & 0x1f0000ff0000ff; + z = (z | z << 8) & 0x100f00f00f00f00f; + z = (z | z << 4) & 0x10c30c30c30c30c3; + z = (z | z << 2) & 0x1249249249249249; + + return x | (y << 1) | (z << 2); + } +} diff --git a/thirdparty/embree/common/math/obbox.h b/thirdparty/embree/common/math/obbox.h new file mode 100644 index 0000000000..2fe8bbf071 --- /dev/null +++ b/thirdparty/embree/common/math/obbox.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "linearspace3.h" + +namespace embree +{ + /*! 
Oriented bounding box */ + template<typename T> + struct OBBox + { + public: + + __forceinline OBBox () {} + + __forceinline OBBox (EmptyTy) + : space(one), bounds(empty) {} + + __forceinline OBBox (const BBox<T>& bounds) + : space(one), bounds(bounds) {} + + __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) + : space(space), bounds(bounds) {} + + friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { + return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; + } + + public: + LinearSpace3<T> space; //!< orthonormal transformation + BBox<T> bounds; //!< bounds in transformed space + }; + + typedef OBBox<Vec3f> OBBox3f; + typedef OBBox<Vec3fa> OBBox3fa; +} diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h new file mode 100644 index 0000000000..080800efcd --- /dev/null +++ b/thirdparty/embree/common/math/quaternion.h @@ -0,0 +1,254 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "vec4.h" + +#include "transcendental.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template<typename T> + struct QuaternionT + { + typedef Vec3<T> Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT () { } + __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __forceinline explicit QuaternionT( const Vec3<T>& v ) : 
r(zero), i(v.x), j(v.y), k(v.z) {} + __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} + __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) { + return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! 
returns the rotation axis of the quaternion as a vector */ + __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); } + + public: + T r, i, j, k; + }; + + template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + // evaluates a*q-p + template<typename T> __forceinline QuaternionT<T> + msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(msub(a, q.r, p.r), + msub(a, q.i, p.i), + msub(a, q.j, p.j), + msub(a, q.k, p.k)); + } + // evaluates a*q+p + template<typename T> __forceinline QuaternionT<T> + madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(madd(a, q.r, p.r), + madd(a, q.i, p.i), + madd(a, q.j, p.j), + madd(a, q.k, 
p.k)); + } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { + return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return 
a*rcp(b); } + + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); } + + template<typename T, typename M> __forceinline QuaternionT<T> + select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(select(m, q.r, p.r), + select(m, q.i, p.i), + select(m, q.j, p.j), + select(m, q.k, p.k)); + } + + + template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + + template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + 
//////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x + vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - 
sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + ////////////////////////////////////////////////////////////////////////////// + /// Output Operators + ////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! default template instantiations */ + typedef QuaternionT<float> Quaternion3f; + typedef QuaternionT<double> Quaternion3d; + + template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>; + typedef QuaternionT<vfloat<4>> Quaternion3vf4; + typedef QuaternionT<vfloat<8>> Quaternion3vf8; + typedef QuaternionT<vfloat<16>> Quaternion3vf16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T> + __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1, + const T& factor) + { + QuaternionT<T> q; + q.r = lerp(q0.r, q1.r, factor); + q.i = lerp(q0.i, q1.i, factor); + q.j = lerp(q0.j, q1.j, factor); + q.k = lerp(q0.k, q1.k, factor); + return q; + } + + template<typename T> + __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1_, + const T& t) + { + T cosTheta = dot(q0, q1_); + QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); + cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); + if (unlikely(all(cosTheta > 0.9995f))) { + return normalize(lerp(q0, q1, t)); + } + const T phi = t * fastapprox::acos(cosTheta); + T sinPhi, cosPhi; + fastapprox::sincos(phi, sinPhi, cosPhi); + QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); + return msub(cosPhi, q0, qperp); + } +} diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h new file mode 100644 index 
0000000000..909fadb995 --- /dev/null +++ b/thirdparty/embree/common/math/range.h @@ -0,0 +1,137 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../math/math.h" + +namespace embree +{ + template<typename Ty> + struct range + { + __forceinline range() {} + + __forceinline range(const Ty& begin) + : _begin(begin), _end(begin+1) {} + + __forceinline range(const Ty& begin, const Ty& end) + : _begin(begin), _end(end) {} + + __forceinline range(const range& other) + : _begin(other._begin), _end(other._end) {} + + template<typename T1> + __forceinline range(const range<T1>& other) + : _begin(Ty(other._begin)), _end(Ty(other._end)) {} + + template<typename T1> + __forceinline range& operator =(const range<T1>& other) { + _begin = other._begin; + _end = other._end; + return *this; + } + + __forceinline Ty begin() const { + return _begin; + } + + __forceinline Ty end() const { + return _end; + } + + __forceinline range intersect(const range& r) const { + return range (max(_begin,r._begin),min(_end,r._end)); + } + + __forceinline Ty size() const { + return _end - _begin; + } + + __forceinline bool empty() const { + return _end <= _begin; + } + + __forceinline Ty center() const { + return (_begin + _end)/2; + } + + __forceinline std::pair<range,range> split() const + { + const Ty _center = center(); + return std::make_pair(range(_begin,_center),range(_center,_end)); + } + + __forceinline void split(range& left_o, range& right_o) const + { + const Ty _center = center(); + left_o = range(_begin,_center); + right_o = range(_center,_end); + } + + __forceinline friend bool operator< (const range& r0, const range& r1) { + return r0.size() < r1.size(); + } + + friend embree_ostream operator<<(embree_ostream cout, const range& r) { + return cout << "range [" << r.begin() << ", " << r.end() << "]"; + } + + Ty _begin, _end; + }; + + template<typename Ty> + range<Ty> make_range(const Ty& begin, 
const Ty& end) { + return range<Ty>(begin,end); + } + + template<typename Ty> + struct extended_range : public range<Ty> + { + __forceinline extended_range () {} + + __forceinline extended_range (const Ty& begin) + : range<Ty>(begin), _ext_end(begin+1) {} + + __forceinline extended_range (const Ty& begin, const Ty& end) + : range<Ty>(begin,end), _ext_end(end) {} + + __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) + : range<Ty>(begin,end), _ext_end(ext_end) {} + + __forceinline Ty ext_end() const { + return _ext_end; + } + + __forceinline Ty ext_size() const { + return _ext_end - range<Ty>::_begin; + } + + __forceinline Ty ext_range_size() const { + return _ext_end - range<Ty>::_end; + } + + __forceinline bool has_ext_range() const { + assert(_ext_end >= range<Ty>::_end); + return (_ext_end - range<Ty>::_end) > 0; + } + + __forceinline void set_ext_range(const size_t ext_end){ + assert(ext_end >= range<Ty>::_end); + _ext_end = ext_end; + } + + __forceinline void move_right(const size_t plus){ + range<Ty>::_begin += plus; + range<Ty>::_end += plus; + _ext_end += plus; + } + + friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { + return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; + } + + Ty _ext_end; + }; +} diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h new file mode 100644 index 0000000000..fd16c26e81 --- /dev/null +++ b/thirdparty/embree/common/math/transcendental.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Transcendental functions from "ispc": https://github.com/ispc/ispc/ +// Most of the transcendental implementations in ispc code come from +// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ + +#include "../simd/simd.h" + +namespace embree +{ + +namespace fastapprox +{ + +template <typename T> 
+__forceinline T sin(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto flipSign = (kMod4 > 1); + + // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, + // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); + static const float sinC2 = -0.16666667163372039794921875; + static const float sinC4 = +8.333347737789154052734375e-3; + static const float sinC6 = -1.9842604524455964565277099609375e-4; + static const float sinC8 = +2.760012648650445044040679931640625e-6; + static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + static const float cosC2 = -0.5; + static const float cosC4 = +4.166664183139801025390625e-2; + static const float cosC6 = -1.388833043165504932403564453125e-3; + static const float cosC8 = +2.47562347794882953166961669921875e-5; + static const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(sinUseCos, 1., x); + auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); + auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); + auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); + auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); + auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline T cos(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal 
= floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + + auto kMod4 = k & 3; + auto cosUseCos = (kMod4 == 0 | kMod4 == 2); + auto flipSign = (kMod4 == 1 | kMod4 == 2); + + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(cosUseCos, 1., x); + auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); + auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); + auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); + auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); + auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline void sincos(const T &v, T &sinResult, T &cosResult) +{ + const float piOverTwoVec = 1.57079637050628662109375; + const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); + auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); + auto sinFlipSign = (kMod4 > 1); + auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); + + const float oneVec = +1.; + const 
float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto x2 = x * x; + + auto sinFormula = x2 * sinC10 + sinC8; + auto cosFormula = x2 * cosC10 + cosC8; + sinFormula = x2 * sinFormula + sinC6; + cosFormula = x2 * cosFormula + cosC6; + + sinFormula = x2 * sinFormula + sinC4; + cosFormula = x2 * cosFormula + cosC4; + + sinFormula = x2 * sinFormula + sinC2; + cosFormula = x2 * cosFormula + cosC2; + + sinFormula = x2 * sinFormula + oneVec; + cosFormula = x2 * cosFormula + oneVec; + + sinFormula *= x; + + sinResult = select(sinUseCos, cosFormula, sinFormula); + cosResult = select(cosUseCos, cosFormula, sinFormula); + + sinResult = select(sinFlipSign, -sinResult, sinResult); + cosResult = select(cosFlipSign, -cosResult, cosResult); +} + +template <typename T> +__forceinline T tan(const T &v) +{ + const float piOverFourVec = 0.785398185253143310546875; + const float fourOverPiVec = 1.27323949337005615234375; + + auto xLt0 = v < 0.; + auto y = select(xLt0, -v, v); + auto scaled = y * fourOverPiVec; + + auto kReal = floor(scaled); + auto k = toInt(kReal); + + auto x = y - kReal * piOverFourVec; + + // If k & 1, x -= Pi/4 + auto needOffset = (k & 1) != 0; + x = select(needOffset, x - piOverFourVec, x); + + // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... 
+ auto kMod4 = k & 3; + auto useCotan = (kMod4 == 1) | (kMod4 == 2); + + const float oneVec = 1.0; + + const float tanC2 = +0.33333075046539306640625; + const float tanC4 = +0.13339905440807342529296875; + const float tanC6 = +5.3348250687122344970703125e-2; + const float tanC8 = +2.46033705770969390869140625e-2; + const float tanC10 = +2.892402000725269317626953125e-3; + const float tanC12 = +9.500005282461643218994140625e-3; + + const float cotC2 = -0.3333333432674407958984375; + const float cotC4 = -2.222204394638538360595703125e-2; + const float cotC6 = -2.11752182804048061370849609375e-3; + const float cotC8 = -2.0846328698098659515380859375e-4; + const float cotC10 = -2.548247357481159269809722900390625e-5; + const float cotC12 = -3.5257363606433500535786151885986328125e-7; + + auto x2 = x * x; + T z; + if (any(useCotan)) + { + auto cotVal = x2 * cotC12 + cotC10; + cotVal = x2 * cotVal + cotC8; + cotVal = x2 * cotVal + cotC6; + cotVal = x2 * cotVal + cotC4; + cotVal = x2 * cotVal + cotC2; + cotVal = x2 * cotVal + oneVec; + // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. 
+ cotVal /= -x; + z = cotVal; + } + auto useTan = !useCotan; + if (any(useTan)) + { + auto tanVal = x2 * tanC12 + tanC10; + tanVal = x2 * tanVal + tanC8; + tanVal = x2 * tanVal + tanC6; + tanVal = x2 * tanVal + tanC4; + tanVal = x2 * tanVal + tanC2; + tanVal = x2 * tanVal + oneVec; + // Equation was for tan(x)/x + tanVal *= x; + z = select(useTan, tanVal, z); + } + return select(xLt0, -z, z); +} + +template <typename T> +__forceinline T asin(const T &x0) +{ + auto isneg = (x0 < 0.f); + auto x = abs(x0); + auto isnan = (x > 1.f); + + // sollya + // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], + // [1e-20;.9999999999999999]); + // avg error: 1.1105439e-06, max error 1.3187528e-06 + auto v = 1.57079517841339111328125f + + x * (-0.21450997889041900634765625f + + x * (8.78556668758392333984375e-2f + + x * (-4.489909112453460693359375e-2f + + x * (1.928029954433441162109375e-2f + + x * (-4.3095736764371395111083984375e-3f))))); + + v *= -sqrt(1.f - x); + v = v + 1.57079637050628662109375f; + + v = select(v < 0.f, T(0.f), v); + v = select(isneg, -v, v); + v = select(isnan, T(cast_i2f(0x7fc00000)), v); + + return v; +} + +template <typename T> +__forceinline T acos(const T &v) +{ + return 1.57079637050628662109375f - asin(v); +} + +template <typename T> +__forceinline T atan(const T &v) +{ + const float piOverTwoVec = 1.57079637050628662109375; + // atan(-x) = -atan(x) (so flip from negative to positive first) + // If x > 1 -> atan(x) = Pi/2 - atan(1/x) + auto xNeg = v < 0.f; + auto xFlipped = select(xNeg, -v, v); + + auto xGt1 = xFlipped > 1.; + auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); + + // These coefficients approximate atan(x)/x + const float atanC0 = +0.99999988079071044921875; + const float atanC2 = -0.3333191573619842529296875; + const float atanC4 = +0.199689209461212158203125; + const float atanC6 = -0.14015688002109527587890625; + const float atanC8 = +9.905083477497100830078125e-2; + const float atanC10 = 
-5.93664981424808502197265625e-2; + const float atanC12 = +2.417283318936824798583984375e-2; + const float atanC14 = -4.6721356920897960662841796875e-3; + + auto x2 = x * x; + auto result = x2 * atanC14 + atanC12; + result = x2 * result + atanC10; + result = x2 * result + atanC8; + result = x2 * result + atanC6; + result = x2 * result + atanC4; + result = x2 * result + atanC2; + result = x2 * result + atanC0; + result *= x; + + result = select(xGt1, piOverTwoVec - result, result); + result = select(xNeg, -result, result); + return result; +} + +template <typename T> +__forceinline T atan2(const T &y, const T &x) +{ + const float piVec = 3.1415926536; + // atan2(y, x) = + // + // atan2(y > 0, x = +-0) -> Pi/2 + // atan2(y < 0, x = +-0) -> -Pi/2 + // atan2(y = +-0, x < +0) -> +-Pi + // atan2(y = +-0, x >= +0) -> +-0 + // + // atan2(y >= 0, x < 0) -> Pi + atan(y/x) + // atan2(y < 0, x < 0) -> -Pi + atan(y/x) + // atan2(y, x > 0) -> atan(y/x) + // + // and then a bunch of code for dealing with infinities. 
+ auto yOverX = y * rcpSafe(x); + auto atanArg = atan(yOverX); + auto xLt0 = x < 0.f; + auto yLt0 = y < 0.f; + auto offset = select(xLt0, + select(yLt0, T(-piVec), T(piVec)), 0.f); + return offset + atanArg; +} + +template <typename T> +__forceinline T exp(const T &v) +{ + const float ln2Part1 = 0.6931457519; + const float ln2Part2 = 1.4286067653e-6; + const float oneOverLn2 = 1.44269502162933349609375; + + auto scaled = v * oneOverLn2; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * ln2Part1; + x -= kReal * ln2Part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.; + const float c2 = 0.4999999105930328369140625; + const float c3 = 0.166668415069580078125; + const float c4 = 4.16539050638675689697265625e-2; + const float c5 = 8.378830738365650177001953125e-3; + const float c6 = 1.304379315115511417388916015625e-3; + const float c7 = 2.7555381529964506626129150390625e-4; + + auto result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + auto biasedN = k + fpbias; + auto overflow = kReal > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. 
+ auto underflow = kReal <= -fpbias; + const int infBits = 0x7f800000; + biasedN <<= 23; + // Reinterpret this thing as float + auto twoToTheN = asFloat(biasedN); + // Handle both doubles and floats (hopefully eliding the copy for float) + auto elemtype2n = twoToTheN; + result *= elemtype2n; + result = select(overflow, cast_i2f(infBits), result); + result = select(underflow, 0., result); + return result; +} + +// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n +// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). +template <typename T, typename R> +__forceinline void __rangeReduceLog(const T &input, + T &reduced, + R &exponent) +{ + auto intVersion = asInt(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponentMask(0x7F800000) + static const int nonexponentMask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is + // -1 + 127 after biasing or 126 + static const int exponentNeg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). 
+ auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] + + auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + exponent = offsetExponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + auto blended = (intVersion & nonexponentMask) | (exponentNeg1); + reduced = asFloat(blended); +} + +template <typename T> struct ExponentType { }; +template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; +template <> struct ExponentType<float> { typedef int Ty; }; + +template <typename T> +__forceinline T log(const T &v) +{ + T reduced; + typename ExponentType<T>::Ty exponent; + + const int nanBits = 0x7fc00000; + const int negInfBits = 0xFF800000; + const float nan = cast_i2f(nanBits); + const float negInf = cast_i2f(negInfBits); + auto useNan = v < 0.; + auto useInf = v == 0.; + auto exceptional = useNan | useInf; + const float one = 1.0; + + auto patched = select(exceptional, one, v); + __rangeReduceLog(patched, reduced, exponent); + + const float ln2 = 0.693147182464599609375; + + auto x1 = one - reduced; + const float c1 = +0.50000095367431640625; + const float c2 = +0.33326041698455810546875; + const float c3 = +0.2519190013408660888671875; + const float c4 = +0.17541764676570892333984375; + const float c5 = +0.3424419462680816650390625; + const float c6 = -0.599632322788238525390625; + const float c7 = +1.98442304134368896484375; + const float c8 = -2.4899270534515380859375; + const float c9 = +1.7491014003753662109375; + + auto result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += 
toFloat(exponent) * ln2; + + return select(exceptional, + select(useNan, T(nan), T(negInf)), + result); +} + +template <typename T> +__forceinline T pow(const T &x, const T &y) +{ + auto x1 = abs(x); + auto z = exp(y * log(x1)); + + // Handle special cases + const float twoOver23 = 8388608.0f; + auto yInt = y == round(y); + auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit + + // x == 0 + z = select(x == 0.0f, + select(y < 0.0f, T(inf) | signmsk(x), + select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); + + // x < 0 + auto xNegative = x < 0.0f; + if (any(xNegative)) + { + auto z1 = z | asFloat(yOddInt); + z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); + z = select(xNegative, z1, z); + } + + auto xFinite = isfinite(x); + auto yFinite = isfinite(y); + if (all(xFinite & yFinite)) + return z; + + // x finite and y infinite + z = select(andn(xFinite, yFinite), + select(x1 == 1.0f, 1.0f, + select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); + + // x infinite + z = select(xFinite, z, + select(y == 0.0f, 1.0f, + select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); + + return z; +} + +template <typename T> +__forceinline T pow(const T &x, float y) +{ + return pow(x, T(y)); +} + +} // namespace fastapprox + +} // namespace embree diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h new file mode 100644 index 0000000000..d62aef51f3 --- /dev/null +++ b/thirdparty/embree/common/math/vec2.h @@ -0,0 +1,235 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec2fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 2D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec2 + { + enum { N = 2 }; + union { + struct { T x, y; }; +#if !(defined(__WIN32__) && _MSC_VER == 
1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ) {} + __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} + __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} + + __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2( const Vec2fa& other ); + + template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} + template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} + __forceinline Vec2( OneTy ) : x(one), y(one) {} + __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} + __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} + +#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } + __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + 
/*! Component-wise unary plus. */
template<typename T>
__forceinline Vec2<T> operator +( const Vec2<T>& a )
{
  return Vec2<T>(+a.x, +a.y);
}

/*! Component-wise negation. */
template<typename T>
__forceinline Vec2<T> operator -( const Vec2<T>& a )
{
  return Vec2<T>(-a.x, -a.y);
}

/*! Component-wise abs(). */
template<typename T>
__forceinline Vec2<T> abs( const Vec2<T>& a )
{
  return Vec2<T>(abs(a.x), abs(a.y));
}

/*! Component-wise rcp(). */
template<typename T>
__forceinline Vec2<T> rcp( const Vec2<T>& a )
{
  return Vec2<T>(rcp(a.x), rcp(a.y));
}

/*! Component-wise rsqrt(). */
template<typename T>
__forceinline Vec2<T> rsqrt( const Vec2<T>& a )
{
  return Vec2<T>(rsqrt(a.x), rsqrt(a.y));
}

/*! Component-wise sqrt(). */
template<typename T>
__forceinline Vec2<T> sqrt( const Vec2<T>& a )
{
  return Vec2<T>(sqrt(a.x), sqrt(a.y));
}

/*! Component-wise frac(). */
template<typename T>
__forceinline Vec2<T> frac( const Vec2<T>& a )
{
  return Vec2<T>(frac(a.x), frac(a.y));
}

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////

/*! Addition: vector + vector, vector + scalar, scalar + vector. */
template<typename T>
__forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(a.x + b.x, a.y + b.y);
}

template<typename T>
__forceinline Vec2<T> operator +( const Vec2<T>& a, const T& b )
{
  return Vec2<T>(a.x + b, a.y + b);
}

template<typename T>
__forceinline Vec2<T> operator +( const T& a, const Vec2<T>& b )
{
  return Vec2<T>(a + b.x, a + b.y);
}

/*! Subtraction: vector - vector, vector - scalar, scalar - vector. */
template<typename T>
__forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(a.x - b.x, a.y - b.y);
}

template<typename T>
__forceinline Vec2<T> operator -( const Vec2<T>& a, const T& b )
{
  return Vec2<T>(a.x - b, a.y - b);
}

template<typename T>
__forceinline Vec2<T> operator -( const T& a, const Vec2<T>& b )
{
  return Vec2<T>(a - b.x, a - b.y);
}

/*! Multiplication: vector * vector, scalar * vector, vector * scalar. */
template<typename T>
__forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(a.x * b.x, a.y * b.y);
}

template<typename T>
__forceinline Vec2<T> operator *( const T& a, const Vec2<T>& b )
{
  return Vec2<T>(a * b.x, a * b.y);
}

template<typename T>
__forceinline Vec2<T> operator *( const Vec2<T>& a, const T& b )
{
  return Vec2<T>(a.x * b, a.y * b);
}

/*! Division: vector / vector, vector / scalar, scalar / vector. */
template<typename T>
__forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(a.x / b.x, a.y / b.y);
}

template<typename T>
__forceinline Vec2<T> operator /( const Vec2<T>& a, const T& b )
{
  return Vec2<T>(a.x / b, a.y / b);
}

template<typename T>
__forceinline Vec2<T> operator /( const T& a, const Vec2<T>& b )
{
  return Vec2<T>(a / b.x, a / b.y);
}

/*! Component-wise minimum of two vectors. */
template<typename T>
__forceinline Vec2<T> min( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(min(a.x, b.x), min(a.y, b.y));
}

/*! Component-wise maximum of two vectors. */
template<typename T>
__forceinline Vec2<T> max( const Vec2<T>& a, const Vec2<T>& b )
{
  return Vec2<T>(max(a.x, b.x), max(a.y, b.y));
}

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////

/*! Each of the following forwards component-wise to the scalar function
 *  of the same name (madd, msub, nmadd, nmsub). */
template<typename T>
__forceinline Vec2<T> madd( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y));
}

template<typename T>
__forceinline Vec2<T> msub( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y));
}

template<typename T>
__forceinline Vec2<T> nmadd( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y));
}

template<typename T>
__forceinline Vec2<T> nmsub( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y));
}

/*! Variants whose first operand is one scalar shared by both components. */
template<typename T>
__forceinline Vec2<T> madd( const T& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(madd(a,b.x,c.x), madd(a,b.y,c.y));
}

template<typename T>
__forceinline Vec2<T> msub( const T& a, const Vec2<T>& b, const Vec2<T>& c )
{
  return Vec2<T>(msub(a,b.x,c.x), msub(a,b.y,c.y));
}
template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; } + template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; } + template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } + template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; } + template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; } + template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); } + template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; } + template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { 
return a.x != b.x || a.y != b.y; } + template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) { + return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } + template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); } + template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); } + template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; } + + template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const 
Vec2<T>& f ) { + return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> + __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) { + return madd(Vec2<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec2<T>& a ) + { + const Vec2<T> b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec2<bool > Vec2b; + typedef Vec2<int > Vec2i; + typedef Vec2<float> Vec2f; +} + +#include "vec2fa.h" + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} + +#if defined(__SSE__) + template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX__) + template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h new file mode 100644 index 
0000000000..a51fb68fd0 --- /dev/null +++ b/thirdparty/embree/common/math/vec2fa.h @@ -0,0 +1,301 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec2fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec2fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 2 }; + union { + __m128 m128; + struct { float x,y,az,aw; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ) {} + __forceinline Vec2fa( const __m128 a ) : m128(a) {} + + __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } + __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } + __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} + + __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec2fa load( const void* const a ) { + return 
Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline Vec2fa loadu( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { + _mm_storeu_ps((float*)ptr,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } + __forceinline Vec2fa operator -( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec2fa abs ( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec2fa sign ( const Vec2fa& a ) { + return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); + } + + __forceinline Vec2fa rcp ( const Vec2fa& a ) + { +#if 
defined(__AVX512VL__) + const Vec2fa r = _mm_rcp14_ps(a.m128); +#else + const Vec2fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } + + __forceinline Vec2fa rsqrt( const Vec2fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec2fa zero_fix(const Vec2fa& a) { + return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec2fa rcp_safe(const Vec2fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec2fa log ( const Vec2fa& a ) { + return Vec2fa(logf(a.x),logf(a.y)); + } + + __forceinline Vec2fa exp ( const Vec2fa& a ) { + return Vec2fa(expf(a.x),expf(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } + __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * 
b; } + __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { + return Vec2fa(powf(a.x,b),powf(a.y,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } + __forceinline Vec2fa msub ( const Vec2fa& a, const 
Vec2fa& b, const Vec2fa& c) { return a*b-c; } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } +#endif + + __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } + __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } + __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } + __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } + __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } + __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } + __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } + __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } + __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); + } +#else + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec2fa cross ( const Vec2fa& a ) { + return Vec2fa(-a.y,a.x); + } + + __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec2fa& a ) + { + const Vec2fa b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } + __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } + __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +#elif defined (__SSE4_1__) + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#else + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + typedef Vec2fa Vec2fa_t; +} diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h new file mode 100644 index 0000000000..ce94eff327 --- /dev/null +++ 
b/thirdparty/embree/common/math/vec3.h @@ -0,0 +1,337 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec3fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 3D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec3 + { + enum { N = 3 }; + + union { + struct { + T x, y, z; + }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ) {} + __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} + __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} + + __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } + __forceinline Vec3( const Vec3fa& other ); + + template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} + template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; } + + __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} + __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} + __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} + __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) 
// workaround for older VS 2013 compiler + __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } + __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); } + template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); } + template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); } + template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } + template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } + + template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a ) + { + return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x), + select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y), + select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z)); + } + template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator 
+( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); } + template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); } + + template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } + template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } + + template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); } + template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( 
madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } + + template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x 
*= b ; a.y *= b ; a.z *= b ; return a; } + template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; } + template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; } + template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); } + template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } + template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } + template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) { + return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + 
//////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> + __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) { + return madd(Vec3<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec3<T>& a ) + { + const Vec3<T> b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); } + template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); } + template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); } + template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); } + template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); } + template<typename T> __forceinline 
Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } + template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } + template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); } + template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); } + template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } + template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } + template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } + + template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) + { + const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; + const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; + const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); + const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); + const auto sx = abs(ab_x) < abs(bc_x); + const auto sy = abs(ab_y) < abs(bc_y); + const auto sz = abs(ab_z) < abs(bc_z); + return Vec3<T>(select(sx,cross_ab.x,cross_bc.x), + select(sy,cross_ab.y,cross_bc.y), + select(sz,cross_ab.z,cross_bc.z)); + } + + template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; } + + template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + template<typename T> 
__forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); } + + template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1) + { + const Vec3<T> N = cross(P-Q0,Q1-Q0); + const Vec3<T> D = Q1-Q0; + return dot(N,N)*rcp(dot(D,D)); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0) + { + const Vec3<T> N = cross(PmQ0,Q1mQ0); + const Vec3<T> D = Q1mQ0; + return dot(N,N)*rcp(dot(D,D)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3<bool > Vec3b; + typedef Vec3<int > Vec3i; + typedef Vec3<float> Vec3f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<typename Out, typename In> + __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) { + return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); + } + + template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } + +#if defined(__AVX__) + template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } 
+#elif defined(__SSE__) + template<> + __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); + } +#endif + +#if defined(__SSE__) + template<> + __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) { + return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX__) + template<> + __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } + + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) { + return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h new file mode 100644 index 0000000000..a021b522dc --- /dev/null +++ b/thirdparty/embree/common/math/vec3ba.h @@ -0,0 +1,120 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" 
+#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ba Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ba + { + ALIGNED_STRUCT_(16); + + union { + __m128 m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( ) {} + __forceinline Vec3ba( const __m128 input ) : m128(input) {} + __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} + __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ba( bool a ) + : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline Vec3ba( bool a, bool b, bool c) + : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); 
return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } + __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } + __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } + __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } + __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; + } + __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; + } + __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { + if (a.x != 
b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } + __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } + + __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } + __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } + __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } + + __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { + return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? 
"1" : "0") << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h new file mode 100644 index 0000000000..586039741d --- /dev/null +++ b/thirdparty/embree/common/math/vec3fa.h @@ -0,0 +1,727 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ) {} + __forceinline Vec3fa( const __m128 a ) : m128(a) {} + + __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } + __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { 
return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fa load( const void* const a ) { + return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fa loadu( const void* const a ) { + return Vec3fa(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } + __forceinline Vec3fa operator -( const Vec3fa& a ) { + const __m128 
mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fa abs ( const Vec3fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fa sign ( const Vec3fa& a ) { + return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); + } + + __forceinline Vec3fa rcp ( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + const Vec3fa r = _mm_rcp14_ps(a.m128); +#else + const Vec3fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fa rsqrt( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fa zero_fix(const Vec3fa& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fa rcp_safe(const Vec3fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fa log ( const Vec3fa& a ) { + return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fa exp ( const Vec3fa& a ) { + return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } + __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } + __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { + return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if 
defined(__AVX2__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } +#endif + + __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } + __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } + __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } + __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } + __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const 
Vec3fa& b ) { return a = a / b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fa& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fa& v ) { + return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); + } + + 
/*! True when x, y and z all lie within [-FLT_MAX, +FLT_MAX]. */
__forceinline bool is_finite ( const Vec3fa& a ) {
  const Vec3ba notBelow = ge_mask(a, Vec3fa(-FLT_MAX));
  const Vec3ba notAbove = le_mask(a, Vec3fa(+FLT_MAX));
  return all(notBelow & notAbove);
}

/*! Same range test as isvalid, but evaluated on the full vfloat4 register.
 *  NOTE(review): presumably this also checks the fourth lane - confirm
 *  against the vboolf4 overload of all(). */
__forceinline bool isvalid4 ( const Vec3fa& v ) {
  const vfloat4 lanes(v.m128);
  return all((lanes > vfloat4(-FLT_LARGE)) & (lanes < vfloat4(+FLT_LARGE)));
}

/*! Same range test as is_finite, but evaluated on the full vfloat4 register.
 *  NOTE(review): presumably this also checks the fourth lane - confirm
 *  against the vboolf4 overload of all(). */
__forceinline bool is_finite4 ( const Vec3fa& a ) {
  const vfloat4 lanes(a.m128);
  return all((lanes >= vfloat4(-FLT_MAX)) & (lanes <= vfloat4(+FLT_MAX)));
}

////////////////////////////////////////////////////////////////////////////////
/// Euclidian Space Operators
////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_1__)
/*! Dot product of the x, y, z components. The 0x7F immediate makes the
 *  dot-product instruction multiply only the three low lanes. */
__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  const __m128 sum = _mm_dp_ps(a.m128, b.m128, 0x7F);
  return _mm_cvtss_f32(sum);
}
#else
/*! Dot product of the x, y, z components: lane-wise product followed by a
 *  three-component horizontal add. */
__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  const Vec3fa products = a*b;
  return reduce_add(products);
}
#endif

/*! Cross product a x b, computed as msub(a, b', a'*b) where the primed values
 *  are shuffled with <1,2,0,3>, and then shuffled once more with <1,2,0,3>. */
__forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
{
  const vfloat4 lhs        = vfloat4(a.m128);
  const vfloat4 rhs        = vfloat4(b.m128);
  const vfloat4 rhsRotated = shuffle<1,2,0,3>(rhs);
  const vfloat4 lhsRotated = shuffle<1,2,0,3>(lhs);
  return Vec3fa(shuffle<1,2,0,3>(msub(lhs, rhsRotated, lhsRotated*rhs)));
}

/*! Squared length dot(a,a). */
__forceinline float sqr_length ( const Vec3fa& a ) {
  return dot(a,a);
}

/*! Reciprocal of the length: rsqrt(dot(a,a)). */
__forceinline float rcp_length ( const Vec3fa& a ) {
  return rsqrt(dot(a,a));
}

/*! Reciprocal of the squared length: rcp(dot(a,a)). */
__forceinline float rcp_length2( const Vec3fa& a ) {
  return rcp(dot(a,a));
}

/*! Length: sqrt(dot(a,a)). */
__forceinline float length ( const Vec3fa& a ) {
  return sqrt(dot(a,a));
}

/*! Scales a by rsqrt(dot(a,a)); there is no special handling for a zero
 *  vector here, see normalize_safe. */
__forceinline Vec3fa normalize( const Vec3fa& a ) {
  return a*rsqrt(dot(a,a));
}

/*! Length of the difference a-b. */
__forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) {
  return length(a-b);
}

/*! d.x*(d.y+d.z) + d.y*d.z, i.e. half the surface area of a box with
 *  extents d. */
__forceinline float halfArea ( const Vec3fa& d ) {
  return madd(d.x,(d.y+d.z),d.y*d.z);
}

/*! Twice halfArea(d). */
__forceinline float area ( const Vec3fa& d ) {
  return 2.0f*halfArea(d);
}

/*! Like normalize, but returns a unchanged when dot(a,a) is exactly zero. */
__forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
  const float sqrLen = dot(a,a);
  if (unlikely(sqrLen == 0.0f))
    return a;
  return a*rsqrt(sqrLen);
}

+ /*! differentiated normalization */ + __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fa& a ) + { + const Vec3fa b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE4_1__) + __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + 
//////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3fa Vec3fa_t; + + + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fx Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fx + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; union { int a; unsigned u; float w; }; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ) {} + __forceinline Vec3fx( const __m128 a ) : m128(a) {} + + __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} + __forceinline operator Vec3fa () const { return Vec3fa(m128); } + + __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } + + __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } + __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } + __forceinline 
Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__SSE4_1__) + m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); +#else + const vint4 mask(-1,-1,-1,0); + m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); +#endif + } + //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! + //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! + __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} + + //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fx load( const void* const a ) { + return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fx loadu( const void* const a ) { + return Vec3fx(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } + __forceinline Vec3fx operator -( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fx abs ( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fx sign ( const Vec3fx& a ) { + return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); + } + + __forceinline Vec3fx rcp ( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + const Vec3fx r = _mm_rcp14_ps(a.m128); +#else + const Vec3fx r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fx sqr ( const 
Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fx rsqrt( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fx zero_fix(const Vec3fx& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fx rcp_safe(const Vec3fx& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fx log ( const Vec3fx& a ) { + return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fx exp ( const Vec3fx& a ) { + return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } + __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } + __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fx max( const 
Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { + return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } +#endif + + __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } + __forceinline 
Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } + __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } + __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } + __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fx& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline 
bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fx& v ) { + return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fx& a ) { + return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fx& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fx& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = 
shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fx& a ) + { + const Vec3fx b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } + __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } +#elif defined (__SSE4_1__) + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO ); } /* truncation rounds toward zero, consistent with the vrndq_f32 and truncf code paths */ + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { + return
cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + + typedef Vec3fx Vec3ff; +} diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h new file mode 100644 index 0000000000..694804c40d --- /dev/null +++ b/thirdparty/embree/common/math/vec3ia.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ia Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ia + { + ALIGNED_STRUCT_(16); + + union { + __m128i m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ) {} + __forceinline Vec3ia( const __m128i a ) : m128(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} + __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + + __forceinline operator const __m128i&() const { return m128; } + __forceinline operator __m128i&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : 
m128(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if defined(__SSSE3__) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } + __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } + + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } + __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } + __forceinline Vec3ia 
operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } + __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } +#endif + + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } + __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } + + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } + __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } + + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } + __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } + + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } + __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } + + __forceinline Vec3ia& operator -=( Vec3ia& a, const 
Vec3ia& b ) { return a = a - b; } + __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } + __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } +#endif + + __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } + __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } + + __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } + __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } + + __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } + __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } + __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + 
if (a.z != b.z) return a.z < b.z; + return false; + } + + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + +#if defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#else + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h new file mode 100644 index 0000000000..0ed107928a --- /dev/null +++ b/thirdparty/embree/common/math/vec4.h @@ -0,0 +1,243 @@ +// Copyright 2009-2021 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" +#include "vec3.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Generic 4D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec4 + { + enum { N = 4 }; + union { + struct { T x, y, z, w; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ) {} + __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} + __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} + __forceinline Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } + __forceinline Vec4( const Vec3fx& other ); + + template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} + template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} + __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} + __forceinline Vec4( 
PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} + __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Swizzles + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); } + template<typename T> __forceinline Vec4<T> abs ( const Vec4<T>& a ) { return Vec4<T>(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } + template<typename T> __forceinline Vec4<T> rcp ( const Vec4<T>& a ) { return Vec4<T>(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } + template<typename T> __forceinline Vec4<T> rsqrt ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } + template<typename T> __forceinline Vec4<T> sqrt ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) { return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x * b , a.y * b , a.z * b , a.w * b ); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x / b , a.y / b , a.z / b , a.w / b ); } + template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) { return Vec4<T>(a / b.x, a / b.y, a / b.z, a / b.w); } + + template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } + template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> madd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), 
madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } + + template<typename T> __forceinline Vec4<T> madd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) { 
a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } + template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; } + template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; } + template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); } + template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } + template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; } + template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + if (a.w != b.w) return a.w < b.w; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) { + return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } + + template<typename T> __forceinline T length ( const Vec4<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> + __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) { + return madd(Vec4<T>(T(1.0f)-t),v0,t*v1); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; 
+ } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec4<bool > Vec4b; + typedef Vec4<unsigned char> Vec4uc; + typedef Vec4<int > Vec4i; + typedef Vec4<float > Vec4f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined __AVX512F__ +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#elif defined(__SSE__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + } +#endif + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} +#endif +} diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h new file mode 100644 index 0000000000..1c3875fb27 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/emulation.h @@ -0,0 +1,50 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* Make precision match SSE, at the cost of some performance */ +#if 
!defined(__aarch64__) +# define SSE2NEON_PRECISE_DIV 1 +# define SSE2NEON_PRECISE_SQRT 1 +#endif + +#include "sse2neon.h" + +__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { + __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c))); + return _mm_fmadd_ps(a, b, neg_c); +} + +__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +#endif +} + +__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { + return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c)))); +} + + +/* Dummy defines for floating point control */ +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_MASK_DENORM 0x100 +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) + +__forceinline int _mm_getcsr() +{ + return 0; +} + +__forceinline void _mm_mfence() +{ + __sync_synchronize(); +} diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h new file mode 100644 index 0000000000..7eb25cf2c5 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/sse2neon.h @@ -0,0 +1,6996 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. 
Ratcliff <jratcliffscarab@gmail.com> +// Brandon Rowlett <browlett@nvidia.com> +// Ken Fast <kfast@gdeb.com> +// Eric van Beurden <evanbeurden@nvidia.com> +// Alexander Potylitsin <apotylitsin@nvidia.com> +// Hasindu Gamaarachchi <hasindu2008@gmail.com> +// Jim Huang <jserv@biilabs.io> +// Mark Cheng <marktwtn@biilabs.io> +// Malcolm James MacLeod <malcolm@gulden.com> +// Devin Hussey (easyaspi314) <husseydevin@gmail.com> +// Sebastian Pop <spop@amazon.com> +// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> +// Danila Kutenin <danilak@google.com> +// François Turban (JishinMaster) <francois.turban@gmail.com> +// Pei-Hsuan Hung <afcidk@gmail.com> +// Yang-Hao Yuan <yanghau@biilabs.io> +// Syoyo Fujita <syoyo@lighttransport.com> +// Brecht Van Lommel <brecht@blender.org> + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +#ifndef SSE2NEON_PRECISE_RSQRT +#define SSE2NEON_PRECISE_RSQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#include <stdint.h> +#include <stdlib.h> + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." 
+#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include <arm_neon.h> + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include <math.h> +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. 
+ * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) 
vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define 
vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. 
+// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ + (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ + (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm_<name>_<data_type> + * + * The parts of this format are given as follows: + * 1. <name> describes the operation performed by the intrinsic + * 2. <data_type> identifies the data type of the function's primary arguments + * + * This last part, <data_type>, is a little complicated. 
It identifies the
 * content of the input values, and can be set to any of the following values:
 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *                            signed integers
 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *                            unsigned integers
 * + si128 - unspecified 128-bit vector or 256-bit vector
 * + m128/m128i/m128d - identifies input vector types when they are different
 *                      than the type of the returned vector
 *
 * For example, _mm_setzero_ps. The _mm implies that the function returns
 * a 128-bit vector. The _ps at the end implies that the argument vectors
 * contain floats.
 *
 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, per 8 bits
 *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
 *                                  4, 5, 12, 13, 6, 7, 14, 15);
 *   // Shuffle packed 8-bit integers
 *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
 *
 * Data (Number, Binary, Byte Index):
    +------+------+-------------+------+------+-------------+
    |      1      |      2      |      3      |      4      | Number
    +------+------+------+------+------+------+------+------+
    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
    +------+------+------+------+------+------+------+------+
    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |      5      |      6      |      7      |      8      | Number
    +------+------+------+------+------+------+------+------+
    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
    +------+------+------+------+------+------+------+------+
    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
    +------+------+------+------+------+------+------+------+
 * Index (Byte Index):
    +------+------+------+------+------+------+------+------+
    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
    +------+------+------+------+------+------+------+------+
 * Result:
    +------+------+------+------+------+------+------+------+
    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
    +------+------+------+------+------+------+------+------+
    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
    +------+------+------+------+------+------+------+------+
    |     256     |      2      |      5      |      6      | Number
    +------+------+------+------+------+------+------+------+

    +------+------+------+------+------+------+------+------+
    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
    +------+------+------+------+------+------+------+------+
    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
    +------+------+------+------+------+------+------+------+
    |      3      |      7      |      4      |      8      | Number
    +------+------+------+------+------+------+-------------+
 */

/* Set/get methods */

/* Constants for use with _mm_prefetch.
 * The enumerators are numbered 0..7; the per-value notes below describe the
 * cache behaviour each hint requests.
 */
enum _mm_hint {
    _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,   /* load data to L2 cache only */
    _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
    _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
    _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
    _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
    _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
};

// Loads one cache line of data from address p to a location closer to the
// processor.
https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +FORCE_INLINE void _mm_pause() +{ + __asm__ __volatile__("isb\n"); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int _mm_cvtss_si64(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64( + vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int64_t) ceil(data); + if (unlikely(diff == 0.5)) { + int64_t f = (int64_t) floor(data); + int64_t c = (int64_t) ceil(data); + return c & 1 ? f : c; + } + return (int64_t) floor(data); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. 
+// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... 
+// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. 
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 4 signed 32-bit integer values to i. 
+// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return _mm_set_pd(0, a); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
+// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. 
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores the lower single - precision, floating - point value. +// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. 
+// +// MEM[mem_addr+63:mem_addr] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. 
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. +// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. 
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
+// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from 16-byte aligned memory, floating-point +// values. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Loads an single - precision, floating - point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load 64-bit integer from memory into the first element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. 
+ */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
+// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. +// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. 
+// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if (__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. 
+// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. 
+// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. 
//
//   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
//   dst[127:0] := tmp[127:0]
//
// Implementation notes:
//  - a and b are each evaluated exactly once, before any macro-local variable
//    is declared, so caller expressions cannot be shadowed by those locals.
//  - imm is parenthesized everywhere it is used.
//  - The vextq_u8 immediate is clamped to 0 in the branch that is not taken,
//    so the immediate stays inside 0..15 in both branches for any imm >= 0.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
#define _mm_alignr_epi8(a, b, imm)                                            \
    __extension__({                                                           \
        const uint8x16_t _alignr_hi = vreinterpretq_u8_m128i(a);              \
        const uint8x16_t _alignr_lo = vreinterpretq_u8_m128i(b);              \
        __m128i _alignr_ret;                                                  \
        if (unlikely((imm) >= 32)) {                                          \
            _alignr_ret = _mm_setzero_si128();                                \
        } else if ((imm) >= 16) {                                             \
            _alignr_ret = vreinterpretq_m128i_u8(                             \
                vextq_u8(_alignr_hi, vdupq_n_u8(0),                           \
                         ((imm) >= 16 && (imm) < 32) ? (imm) - 16 : 0));      \
        } else {                                                              \
            _alignr_ret = vreinterpretq_m128i_u8(vextq_u8(                    \
                _alignr_lo, _alignr_hi, ((imm) < 16) ? (imm) : 0));           \
        }                                                                     \
        _alignr_ret;                                                          \
    })

// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
//
//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
//   dst[63:0] := tmp[63:0]
//
// Same structure as _mm_alignr_epi8: operands captured once, imm
// parenthesized, and the vext_u8 immediate clamped in the branch not taken.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm)                                             \
    __extension__({                                                           \
        const uint8x8_t _alignr_hi = vreinterpret_u8_m64(a);                  \
        const uint8x8_t _alignr_lo = vreinterpret_u8_m64(b);                  \
        __m64 _alignr_ret;                                                    \
        if (unlikely((imm) >= 16)) {                                          \
            _alignr_ret = vreinterpret_m64_s8(vdup_n_s8(0));                  \
        } else if ((imm) >= 8) {                                              \
            _alignr_ret = vreinterpret_m64_u8(                                \
                vext_u8(_alignr_hi, vdup_n_u8(0),                             \
                        ((imm) >= 8 && (imm) < 16) ? (imm) - 8 : 0));         \
        } else {                                                              \
            _alignr_ret = vreinterpret_m64_u8(vext_u8(                        \
                _alignr_lo, _alignr_hi, ((imm) < 8) ? (imm) : 0));            \
        }                                                                     \
        _alignr_ret;                                                          \
    })

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = 
vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = 
vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. 
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm & 0x3];          ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// The macro below builds the result lane by lane: lane 0 is broadcast from a,
+// then lanes 1..3 are overwritten with the selected elements of a, b and b.
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm) \
+    __extension__({ \
+        float32x4_t ret; \
+        ret = vmovq_n_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3); \
+        vreinterpretq_m128_f32(ret); \
+    })
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+//
+// Two implementations are provided:
+//  - when __builtin_shufflevector is available, the four 2-bit fields of imm
+//    are passed to it directly; the two upper indices get +4 so that they
+//    address the second input (b);
+//  - otherwise a switch dispatches the immediates that have a hand-written
+//    helper and every other value goes to _mm_shuffle_ps_default.
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a, b, imm) \
+    __extension__({ \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a); \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b); \
+        float32x4_t _shuf = __builtin_shufflevector( \
+            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf); \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm) \
+    __extension__({ \
+        __m128 ret; \
+        switch (imm) { \
+        case _MM_SHUFFLE(1, 0, 3, 2): \
+            ret = _mm_shuffle_ps_1032((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 3, 0, 1): \
+            ret = _mm_shuffle_ps_2301((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(0, 3, 2, 1): \
+            ret = _mm_shuffle_ps_0321((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 1, 0, 3): \
+            ret = _mm_shuffle_ps_2103((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 1, 0): \
+            ret = _mm_movelh_ps((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 0, 1): \
+            ret = _mm_shuffle_ps_1001((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 0, 1): \
+            ret = _mm_shuffle_ps_0101((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(3, 2, 1, 0): \
+            ret = _mm_shuffle_ps_3210((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(0, 0, 1, 1): \
+            ret = _mm_shuffle_ps_0011((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(0, 0, 2, 2): \
+            ret = _mm_shuffle_ps_0022((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 0, 0): \
+            ret = _mm_shuffle_ps_2200((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(3, 2, 0, 2): \
+            ret = _mm_shuffle_ps_3202((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(3, 2, 3, 2): \
+            ret = _mm_movehl_ps((b), (a)); \
+            break; \
+        case _MM_SHUFFLE(1, 1, 3, 3): \
+            ret = _mm_shuffle_ps_1133((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 0, 1, 0): \
+            ret = _mm_shuffle_ps_2010((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 0, 0, 1): \
+            ret = _mm_shuffle_ps_2001((a), (b)); \
+            break; \
+        case _MM_SHUFFLE(2, 0, 3, 2): \
+            ret = _mm_shuffle_ps_2032((a), (b)); \
+            break; \
+        default: \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break; \
+        } \
+        ret; \
+    })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+// (Helper for _mm_shuffle_epi32 with imm == _MM_SHUFFLE(1, 0, 3, 2).)
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+// Helper for imm == _MM_SHUFFLE(2, 2, 1, 1): result is { a[1], a[1], a[2], a[2] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+// Helper for imm == _MM_SHUFFLE(0, 1, 2, 2): result is { a[2], a[2], a[1], a[0] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+// Helper for imm == _MM_SHUFFLE(3, 3, 3, 2): result is { a[2], a[3], a[3], a[3] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+//
+// Three code paths: a single table lookup on AArch64, two inline-assembly
+// vtbl.8 instructions for other GNU-compatible compilers, and a pure
+// intrinsic fallback built from vtbl2_s8.
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    // Keep only bit 7 and the low four index bits of every control byte.
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm & 0x3];          ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// Built lane by lane: lane 0 is broadcast, then lanes 1..3 are overwritten.
+#define _mm_shuffle_epi32_default(a, imm) \
+    __extension__({ \
+        int32x4_t ret; \
+        ret = vmovq_n_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3); \
+        vreinterpretq_m128i_s32(ret); \
+    })
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+//
+// Broadcasts 32-bit lane imm of a to all four lanes. Unlike the shuffle
+// macros, imm is used here directly as a lane index.
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm) \
+    __extension__({ \
+        vreinterpretq_m128i_s32( \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm) \
+    __extension__({ \
+        vreinterpretq_m128i_s32( \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+//
+// With __builtin_shufflevector the four 2-bit fields of imm are used as lane
+// indices into a. Otherwise a switch routes immediates that have a dedicated
+// helper (including the four single-lane splats) and sends every other value
+// to _mm_shuffle_epi32_default.
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        int32x4_t _input = vreinterpretq_s32_m128i(a); \
+        int32x4_t _shuf = __builtin_shufflevector( \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+        vreinterpretq_m128i_s32(_shuf); \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        switch (imm) { \
+        case _MM_SHUFFLE(1, 0, 3, 2): \
+            ret = _mm_shuffle_epi_1032((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 3, 0, 1): \
+            ret = _mm_shuffle_epi_2301((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 3, 2, 1): \
+            ret = _mm_shuffle_epi_0321((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 1, 0, 3): \
+            ret = _mm_shuffle_epi_2103((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 1, 0): \
+            ret = _mm_shuffle_epi_1010((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 0, 1): \
+            ret = _mm_shuffle_epi_1001((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 0, 1): \
+            ret = _mm_shuffle_epi_0101((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 1, 1): \
+            ret = _mm_shuffle_epi_2211((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 2, 2): \
+            ret = _mm_shuffle_epi_0122((a)); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 2): \
+            ret = _mm_shuffle_epi_3332((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 0, 0, 0): \
+            ret = _mm_shuffle_epi32_splat((a), 0); \
+            break; \
+        case _MM_SHUFFLE(1, 1, 1, 1): \
+            ret = _mm_shuffle_epi32_splat((a), 1); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 2, 2): \
+            ret = _mm_shuffle_epi32_splat((a), 2); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 3): \
+            ret = _mm_shuffle_epi32_splat((a), 3); \
+            break; \
+        default: \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break; \
+        } \
+        ret; \
+    })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+//
+// Lane-by-lane fallback: starts from a copy of a, then rewrites 16-bit lanes
+// 0..3 with the elements of the low half selected by the four 2-bit fields of
+// imm. Lanes 4..7 keep their values from a.
+#define _mm_shufflelo_epi16_function(a, imm) \
+    __extension__({ \
+        int16x8_t ret = vreinterpretq_s16_m128i(a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        vreinterpretq_m128i_s16(ret); \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+//
+// Uses __builtin_shufflevector when available (indices 4, 5, 6, 7 pass the
+// upper half through unchanged); otherwise defers to the fallback above.
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm) \
+    __extension__({ \
+        int16x8_t _input = vreinterpretq_s16_m128i(a); \
+        int16x8_t _shuf = __builtin_shufflevector( \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf); \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? 
b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. 
+// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +/* Shifts */ + + +// Shift packed 16-bit integers in a right by imm while shifting in sign +// bits, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx +#define _mm_slli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm)) <= 0) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s16( \ + vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. 
: +// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely(imm) == 0) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 16)) { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
+// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 64)) { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shifts the 128 - bit value in a right by imm bytes while shifting in +// zeros.imm must be an immediate. +// +// r := srl(a, imm*8) +// +// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm +// must be an immediate. +// +// r := a << (imm * 8) +// +// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_slli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8(vextq_s8( \ + vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ + } \ + ret; \ + }) + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. 
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   ...
+//   r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    // The shift amount is the low 64-bit element of count.
+    const uint64_t amount = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(amount > 15))
+        return _mm_setzero_si128();
+
+    const int16x8_t lanes = vreinterpretq_s16_m128i(a);
+    const int16x8_t by = vdupq_n_s16((int16_t) amount);
+    return vreinterpretq_m128i_s16(vshlq_s16(lanes, by));
+}
+
+// Left-shifts each of the four 32-bit integers in a by the amount held in the
+// low 64 bits of count, filling with zeros. Amounts above 31 give zero.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   r2 := a2 << count
+//   r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    const uint64_t amount = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(amount > 31))
+        return _mm_setzero_si128();
+
+    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
+    const int32x4_t by = vdupq_n_s32((int32_t) amount);
+    return vreinterpretq_m128i_s32(vshlq_s32(lanes, by));
+}
+
+// Left-shifts each of the two 64-bit integers in a by the amount held in the
+// low 64 bits of count, filling with zeros. Amounts above 63 give zero.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//
+// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    const uint64_t amount = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(amount > 63))
+        return _mm_setzero_si128();
+
+    const int64x2_t lanes = vreinterpretq_s64_m128i(a);
+    const int64x2_t by = vdupq_n_s64((int64_t) amount);
+    return vreinterpretq_m128i_s64(vshlq_s64(lanes, by));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
+// while shifting in zeros.
+//
+//   r0 := srl(a0, count)
+//   r1 := srl(a1, count)
+//   ...
+//   r7 := srl(a7, count)
+//
+// The shift amount is read from the low 64 bits of count; amounts above 15
+// give zero. NEON has no variable right shift, so the code shifts left by
+// the negated amount.
+//
+// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
+// while shifting in zeros.
+//
+//   r0 := srl(a0, count)
+//   r1 := srl(a1, count)
+//   r2 := srl(a2, count)
+//   r3 := srl(a3, count)
+//
+// Amounts above 31 give zero.
+//
+// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
+// while shifting in zeros.
+//
+//   r0 := srl(a0, count)
+//   r1 := srl(a1, count)
+//
+// Amounts above 63 give zero.
+//
+// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    // Byte 0 carries the mask bits of the low 64-bit lane and byte 8 those of
+    // the high lane; the latter is moved into bits 8..15 of the result.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    // Reduce every lane to its sign bit, shift lane i left by i, and sum the
+    // four lanes to form the mask.
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    // AND the two 64-bit halves together; the result is all ones only when
+    // both halves are.
+    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    // OR the two 64-bit halves: any surviving bit makes the answer 0.
+    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
+                                                                           : 1;
+}
+
+/* Math operations */
+
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] - b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    // Subtract all four lanes, then let _mm_move_ss combine a with the
+    // full-width difference.
+    const __m128 diff = _mm_sub_ps(a, b);
+    return _mm_move_ss(a, diff);
+}
+
+// Lane-wise difference of the two packed 64-bit integers in a and b.
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    const int64x2_t lhs = vreinterpretq_s64_m128i(a);
+    const int64x2_t rhs = vreinterpretq_s64_m128i(b);
+    return vreinterpretq_m128i_s64(vsubq_s64(lhs, rhs));
+}
+
+// Lane-wise difference of the four signed or unsigned 32-bit integers in a
+// and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    const int32x4_t lhs = vreinterpretq_s32_m128i(a);
+    const int32x4_t rhs = vreinterpretq_s32_m128i(b);
+    return vreinterpretq_m128i_s32(vsubq_s32(lhs, rhs));
+}
+
+// Lane-wise difference of the packed 16-bit integers in a and b (b is
+// subtracted from a).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    const int16x8_t lhs = vreinterpretq_s16_m128i(a);
+    const int16x8_t rhs = vreinterpretq_s16_m128i(b);
+    return vreinterpretq_m128i_s16(vsubq_s16(lhs, rhs));
+}
+
+// Lane-wise difference of the packed 8-bit integers in a and b (b is
+// subtracted from a).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    const int8x16_t lhs = vreinterpretq_s8_m128i(a);
+    const int8x16_t rhs = vreinterpretq_s8_m128i(b);
+    return vreinterpretq_m128i_s8(vsubq_s8(lhs, rhs));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    const int64x1_t lhs = vreinterpret_s64_m64(a);
+    const int64x1_t rhs = vreinterpret_s64_m64(b);
+    return vreinterpret_m64_s64(vsub_s64(lhs, rhs));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    const uint16x8_t lhs = vreinterpretq_u16_m128i(a);
+    const uint16x8_t rhs = vreinterpretq_u16_m128i(b);
+    return vreinterpretq_m128i_u16(vqsubq_u16(lhs, rhs));
+}
+
+// Saturating subtraction of the 16 unsigned 8-bit integers of b from those
+// of a.
+//
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    const uint8x16_t lhs = vreinterpretq_u8_m128i(a);
+    const uint8x16_t rhs = vreinterpretq_u8_m128i(b);
+    return vreinterpretq_m128i_u8(vqsubq_u8(lhs, rhs));
+}
+
+// Saturating subtraction of the 16 signed 8-bit integers of b from those
+// of a.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    const int8x16_t lhs = vreinterpretq_s8_m128i(a);
+    const int8x16_t rhs = vreinterpretq_s8_m128i(b);
+    return vreinterpretq_m128i_s8(vqsubq_s8(lhs, rhs));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+// r7 := SignedSaturate(a7 - b7) +// +// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. 
+// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..15 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..7 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 
0xFFFF : 0 +#if defined(__aarch64__) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..3 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. 
+// +// FOR j := 0 to 3 +// i := j*16 +// IF b[i+15:i] < 0 +// dst[i+15:i] := -(a[i+15:i]) +// ELSE IF b[i+15:i] == 0 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 1 +// i := j*32 +// IF b[i+31:i] < 0 +// dst[i+31:i] := -(a[i+31:i]) +// ELSE IF b[i+31:i] == 0 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := a[i+31:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 
0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7:i] < 0 +// dst[i+7:i] := -(a[i+7:i]) +// ELSE IF b[i+7:i] == 0 +// dst[i+7:i] := 0 +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Adds the four single-precision, floating-point values of a and b. 
+// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1] + db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// adds the scalar single-precision floating point values of a and b. 
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of <a>. + return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or +// unsigned 16-bit integers in b. +// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or +// unsigned 8-bit integers in b. +// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b +// and saturates. 
+// +// r0 := SignedSaturate(a0 + b0) +// r1 := SignedSaturate(a1 + b1) +// ... +// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. 
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. 
+ int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0) + (a1 * b1) +// r1 := (a2 * b2) + (a3 * b3) +// r2 := (a4 * b4) + (a5 * b5) +// r3 := (a6 * b6) + (a7 * b7) +// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... 
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. 
+ uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Computes the fused multiple add product of 32-bit floating point numbers. +// +// Return Value +// Multiplies A and B, and adds C to the temporary result before returning it. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint16x4_t t = + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. 
+// +// FOR j := 0 to 7 +// i := j*8 +// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +// ENDFOR +// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + +// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Divides the four single-precision, floating-point values of a and b. +// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. +// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := 64*j +// dst[i+63:i] := a[i+63:i] / b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. +// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if SSE2NEON_PRECISE_SQRT + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#elif defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. 
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_PRECISE_RSQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. 
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. 
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the pariwise minima of the four signed 32-bit integer values of a +// and b. 
+// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 < b0) ? a0 : b0 +// r1 := (a1 < b1) ? a1 : b1 +// r2 := (a2 < b2) ? a2 : b2 +// r3 := (a3 < b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0)[31:16] +// r1 := (a1 * b1)[31:16] +// ... 
+// r7 := (a7 * b7)[31:16] +// +// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Computes pairwise add of each argument as single-precision, floating-point +// values a and b. +// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif +} + +// Computes pairwise add of each argument as a 16-bit signed or unsigned integer +// values a and b. 
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally substract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsubq_f32( + vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), + vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); +#else + float32x4x2_t c = + vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Computes pairwise difference of each argument as a 16-bit signed or unsigned +// integer values a and b. 
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Subtract + return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); +} + +// Computes saturated pairwise sub of each argument as a 16-bit signed +// integer values a and b. +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated subtract + return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +} + +// Computes pairwise difference of each argument as a 32-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int64x2_t a = vreinterpretq_s64_m128i(_a); + int64x2_t b = vreinterpretq_s64_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|b0|b2] + // [a1|a2|b1|b3] + int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); + int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); + // Subtract + return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); +} + +// Kahan summation for accurate summation of floating-point numbers. 
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? 
s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +/* Compare operations */ + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 
0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. 
+// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + // ARMv7 lacks vcgtq_s64. 
+ // This is based off of Clang's SSE2 polyfill: + // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) + + // Mask the sign bit out since we need a signed AND an unsigned comparison + // and it is ugly to try and split them. + int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); + int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); + int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); + // Check if a > b + int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi > b_hi + int64x2_t gt_hi = vshrq_n_s64(greater, 63); + // Copy lower mask to upper mask + // a_lo > b_lo + int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); + // Compare for equality + int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi == b_hi + int64x2_t eq_hi = vshrq_n_s64(equal, 63); + // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) + int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); + return vreinterpretq_m128i_s64(ret); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). 
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! 
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 
1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. : +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. 
: +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_neq_b = vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +} + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* Conversions */ + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. 
+// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int32_t) ceil(data); + if (unlikely(diff == 0.5)) { + int32_t f = (int32_t) floor(data); + int32_t c = (int32_t) ceil(data); + return c & 1 ? 
f : c; + } + return (int32_t) floor(data); +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then covert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. 
+// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. 
+// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Converts the two signed 8-bit integers in the lower 32 bits to four +// signed 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four signed 16-bit integers in the lower 64 bits to four signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Converts the two signed 16-bit integers in the lower 32 bits two signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. +// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. 
+// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Loads 128-bit value. : +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. 
+// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Loads 128-bit value. : +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. 
+// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void 
_mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + __m128 zero, neg_inf, pos_inf; + + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return (__m128){floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])}; + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), + ceilf(v_float[3])}; + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); + neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])); + pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), + ceilf(v_float[2]), ceilf(v_float[3])); + return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); + default: //_MM_FROUND_CUR_DIRECTION + return (__m128){roundf(v_float[0]), 
roundf(v_float[1]), + roundf(v_float[2]), roundf(v_float[3])}; + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); +#else + return vreinterpret_m64_s32( + vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( + _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
+// +// dst[31:0] := CEIL(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +/* Miscellaneous Operations */ + +// Shifts the 8 signed 16-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// ... 
+// r7 := a7 >> count +// +// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); +} + +// Shifts the 4 signed 32-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// r2 := a2 >> count +// r3 := a3 >> count +// +// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... +// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. 
+// +// r0 := SignedSaturate(a0) +// r1 := SignedSaturate(a1) +// r2 := SignedSaturate(a2) +// r3 := SignedSaturate(a3) +// r4 := SignedSaturate(b0) +// r5 := SignedSaturate(b1) +// r6 := SignedSaturate(b2) +// r7 := SignedSaturate(b3) +// +// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... +// r14 := a7 +// r15 := b7 +// +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. 
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 +// +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the +// lower 2 signed or unsigned 32 - bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. 
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. 
+// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. +// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... 
+// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. 
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
+// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. +// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. 
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
// Implemented as a statement-expression macro so that imm reaches
// vsetq_lane_s16 as the literal the caller wrote.
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8. (In this macro the value is parameter b and the lane is
// parameter imm.)
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm)                               \
    __extension__({                                              \
        vreinterpret_m64_s16(                                    \
            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })

// Extracts the selected signed or unsigned 32-bit integer from a and zero
// extends.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))

// Extracts the selected single-precision (32-bit) floating-point from a.
// The lane is read through a signed 32-bit view of the vector, so the result
// is the element's raw bit pattern as an integer, not a converted value.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))

// Inserts the least significant 32 bits of b into the selected 32-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s32(                                     \
            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
    })

// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))

// Inserts the least significant 64 bits of b into the selected 64-bit integer
// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. 
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
//
// Takes the two 64-bit polynomials _a and _b and returns their product as two
// unsigned 64-bit lanes, the same shape the vmull_p64-based wrapper returns.
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4

    // Add cross products (addition of polynomials over GF(2) is XOR)
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products (vextq_u8 of a vector with itself rotates it
    // by whole bytes)
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// Bit 0 of imm picks the half of a (0 = low, 1 = high) and bit 4 picks the
// half of b; all other bits of imm are masked off before the dispatch.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        // Not reachable: imm & 0x11 can only take the four values above.
        abort();
    }
}

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3),
w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro 
trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define 
SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. 
+// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. 
If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h new file mode 100644 index 0000000000..d3100306ee --- /dev/null +++ b/thirdparty/embree/common/simd/avx.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "sse.h" + +#if defined(__AVX512VL__) +#include "vboolf8_avx512.h" +#include "vboold4_avx512.h" +#else +#include "vboolf8_avx.h" +#include "vboold4_avx.h" +#endif + +#if defined(__AVX2__) +#include "vint8_avx2.h" +#include "vuint8_avx2.h" +#if defined(__X86_64__) +#include "vllong4_avx2.h" +#endif +#else +#include "vint8_avx.h" +#include "vuint8_avx.h" +#endif +#include "vfloat8_avx.h" +#if defined(__X86_64__) +#include "vdouble4_avx.h" +#endif + +#if defined(__AVX512F__) +#include "avx512.h" +#endif + diff --git a/thirdparty/embree/common/simd/avx512.h b/thirdparty/embree/common/simd/avx512.h new file mode 100644 index 0000000000..d43bbacea1 --- /dev/null +++ b/thirdparty/embree/common/simd/avx512.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../math/constants.h" +#include "../sys/alloc.h" +#include "varying.h" + +#include "vboolf16_avx512.h" +#include "vint16_avx512.h" 
+#include "vuint16_avx512.h" +#include "vfloat16_avx512.h" + +#include "vboold8_avx512.h" +#include "vllong8_avx512.h" +#include "vdouble8_avx512.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Prefetching + //////////////////////////////////////////////////////////////////////////////// + +#define PFHINT_L1 0 +#define PFHINT_L2 1 +#define PFHINT_NT 2 + + template<const unsigned int mode> + __forceinline void prefetch(const void * __restrict__ const m) + { + if (mode == PFHINT_L1) + _mm_prefetch((const char*)m,_MM_HINT_T0); + else if (mode == PFHINT_L2) + _mm_prefetch((const char*)m,_MM_HINT_T1); + else if (mode == PFHINT_NT) + _mm_prefetch((const char*)m,_MM_HINT_NTA); + } +} diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h new file mode 100644 index 0000000000..195506b530 --- /dev/null +++ b/thirdparty/embree/common/simd/simd.h @@ -0,0 +1,110 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +/* include SSE wrapper classes */ +#if defined(__SSE__) +# include "sse.h" +#endif + +/* include AVX wrapper classes */ +#if defined(__AVX__) +# include "avx.h" +#endif + +/* include AVX512 wrapper classes */ +#if defined (__AVX512F__) +# include "avx512.h" +#endif + +namespace embree +{ + template <int N> + __forceinline vbool<N> isfinite(const vfloat<N>& v) + { + return (v >= vfloat<N>(-std::numeric_limits<float>::max())) + & (v <= vfloat<N>( std::numeric_limits<float>::max())); + } + + /* foreach unique */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i); + } + } + + 
/* returns the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return i; + } + + /* foreach unique index */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i, j); + } + } + + /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return j; + } + + template<typename Closure> + __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) + { + __aligned(64) int U[2*VSIZEX]; + __aligned(64) int V[2*VSIZEX]; + int index = 0; + for (int y=y0; y<y1; y++) { + const bool lasty = y+1>=y1; + const vintx vy = y; + for (int x=x0; x<x1; ) { //x+=VSIZEX) { + const bool lastx = x+VSIZEX >= x1; + vintx vx = x+vintx(step); + vintx::storeu(&U[index], vx); + vintx::storeu(&V[index], vy); + const int dx = min(x1-x,VSIZEX); + index += dx; + x += dx; + if (index >= VSIZEX || (lastx && lasty)) { + const vboolx valid = vintx(step) < vintx(index); + closure(valid, vintx::load(U), vintx::load(V)); + x-= max(0, index-VSIZEX); + index = 0; + } + } + } + } +} diff --git a/thirdparty/embree/common/simd/sse.cpp 
b/thirdparty/embree/common/simd/sse.cpp new file mode 100644 index 0000000000..535d6943d8 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.cpp @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sse.h" + +namespace embree +{ + const __m128 mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) + }; + + const __m128d mm_lookupmask_pd[4] = { + _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) + }; + +} diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h new file mode 100644 index 0000000000..1465fb4fb0 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.h @@ -0,0 +1,35 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../sys/alloc.h" +#include "../math/constants.h" +#include "varying.h" + +namespace embree +{ +#if defined(__SSE4_1__) + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_blendv_ps(f,t,mask); + } +#else + __forceinline __m128 blendv_ps(__m128 
f, __m128 t, __m128 mask) { + return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); + } +#endif + + extern const __m128 mm_lookupmask_ps[16]; + extern const __m128d mm_lookupmask_pd[4]; +} + +#if defined(__AVX512VL__) +#include "vboolf4_avx512.h" +#else +#include "vboolf4_sse2.h" +#endif +#include "vint4_sse2.h" +#include "vuint4_sse2.h" +#include "vfloat4_sse2.h" diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h new file mode 100644 index 0000000000..9b98d326be --- /dev/null +++ b/thirdparty/embree/common/simd/varying.h @@ -0,0 +1,145 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +namespace embree +{ + /* Varying numeric types */ + template<int N> + struct vfloat_impl + { + union { float f[N]; int i[N]; }; + __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vdouble_impl + { + union { double f[N]; long long i[N]; }; + __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vint_impl + { + int i[N]; + __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vuint_impl + { + unsigned int i[N]; + __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vllong_impl + { + long long i[N]; + __forceinline const long long& operator [](size_t index) const { 
assert(index < N); return i[index]; } + __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + /* Varying bool types */ + template<int N> struct vboolf_impl { int i[N]; }; // for float/int + template<int N> struct vboold_impl { long long i[N]; }; // for double/long long + + /* Varying size constants */ +#if defined(__AVX512VL__) // SKX + const int VSIZEX = 8; // default size + const int VSIZEL = 16; // large size +#elif defined(__AVX__) + const int VSIZEX = 8; + const int VSIZEL = 8; +#else + const int VSIZEX = 4; + const int VSIZEL = 4; +#endif + + template<int N> + struct vtypes { + using vbool = vboolf_impl<N>; + using vboolf = vboolf_impl<N>; + using vboold = vboold_impl<N>; + using vint = vint_impl<N>; + using vuint = vuint_impl<N>; + using vllong = vllong_impl<N>; + using vfloat = vfloat_impl<N>; + using vdouble = vdouble_impl<N>; + }; + + template<> + struct vtypes<1> { + using vbool = bool; + using vboolf = bool; + using vboold = bool; + using vint = int; + using vuint = unsigned int; + using vllong = long long; + using vfloat = float; + using vdouble = double; + }; + + /* Aliases to default types */ + template<int N> using vbool = typename vtypes<N>::vbool; + template<int N> using vboolf = typename vtypes<N>::vboolf; + template<int N> using vboold = typename vtypes<N>::vboold; + template<int N> using vint = typename vtypes<N>::vint; + template<int N> using vuint = typename vtypes<N>::vuint; + template<int N> using vllong = typename vtypes<N>::vllong; + template<int N> using vreal = typename vtypes<N>::vfloat; + template<int N> using vfloat = typename vtypes<N>::vfloat; + template<int N> using vdouble = typename vtypes<N>::vdouble; + + /* 4-wide shortcuts */ + typedef vfloat<4> vfloat4; + typedef vdouble<4> vdouble4; + typedef vreal<4> vreal4; + typedef vint<4> vint4; + typedef vuint<4> vuint4; + typedef vllong<4> vllong4; + typedef vbool<4> vbool4; + typedef vboolf<4> vboolf4; + typedef vboold<4> vboold4; + + /* 
8-wide shortcuts */ + typedef vfloat<8> vfloat8; + typedef vdouble<8> vdouble8; + typedef vreal<8> vreal8; + typedef vint<8> vint8; + typedef vuint<8> vuint8; + typedef vllong<8> vllong8; + typedef vbool<8> vbool8; + typedef vboolf<8> vboolf8; + typedef vboold<8> vboold8; + + /* 16-wide shortcuts */ + typedef vfloat<16> vfloat16; + typedef vdouble<16> vdouble16; + typedef vreal<16> vreal16; + typedef vint<16> vint16; + typedef vuint<16> vuint16; + typedef vllong<16> vllong16; + typedef vbool<16> vbool16; + typedef vboolf<16> vboolf16; + typedef vboold<16> vboold16; + + /* Default shortcuts */ + typedef vfloat<VSIZEX> vfloatx; + typedef vdouble<VSIZEX> vdoublex; + typedef vreal<VSIZEX> vrealx; + typedef vint<VSIZEX> vintx; + typedef vuint<VSIZEX> vuintx; + typedef vllong<VSIZEX> vllongx; + typedef vbool<VSIZEX> vboolx; + typedef vboolf<VSIZEX> vboolfx; + typedef vboold<VSIZEX> vbooldx; +} diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h new file mode 100644 index 0000000000..7db0d1c5c1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx.h @@ -0,0 +1,169 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX bool type for 64bit data types*/ + template<> + struct vboold<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + struct { __m128d vl,vh; }; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const 
vboold4& a) { v = a.v; } + __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } + + __forceinline vboold(__m256d a) : v(a) {} + __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} + + __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } + __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } + __forceinline operator const __m256d() const { return v; } + + __forceinline vboold(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi64x(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); +#else + vl = mm_lookupmask_pd[a & 0x3]; + vh = mm_lookupmask_pd[a >> 2]; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} + __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + + __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { + return _mm256_blendv_pd(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } + + +#if defined(__AVX2__) + template<int i0, int i1, int i2, int i3> + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline 
vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + + __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } + __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { + /* print exactly the 4 lanes of this mask as "<l0, l1, l2, l3>"; operator[] asserts index < 4, so lanes 4..7 must not be read */ + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << 
a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold4_avx512.h b/thirdparty/embree/common/simd/vboold4_avx512.h new file mode 100644 index 0000000000..ceaad7bba5 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx512.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboold<4> + { + typedef vboold4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& t) { v = t.v; } + __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 
0xf : 0x0; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x0) {} + __forceinline vboold(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold4& a) { return a.v == 0xf; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold8_avx512.h b/thirdparty/embree/common/simd/vboold8_avx512.h new file mode 100644 index 0000000000..66d2054872 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold8_avx512.h @@ -0,0 +1,151 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboold<8> + { + typedef vboold8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold8& t) { v = t.v; } + __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8& t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x00) {} + __forceinline vboold(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } + __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } + __forceinline 
vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + + __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } + __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } + __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold8& a) { return a.v == 0xff; } + __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } + __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } + __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } 
+ __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h new file mode 100644 index 0000000000..19841dcea8 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf16_avx512.h @@ -0,0 +1,153 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 bool type */ + template<> + struct vboolf<16> + 
{ + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + __mmask16 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf16& t) { v = t.v; } + __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask16& t) { v = t; } + __forceinline operator __mmask16() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } + __forceinline vboolf(int t) { v = (__mmask16)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m512i mask32() const { + return _mm512_movm_epi32(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0000) {} + __forceinline vboolf(TrueTy) : v(0xffff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 16); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators 
+ //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } + __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } + __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } + + __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } + __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } + __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } + __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { + return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } + __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } + __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } + + __forceinline 
int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } + __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } + __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } + __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } + __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) + { + cout << "<"; + for (size_t i=0; i<16; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_avx512.h b/thirdparty/embree/common/simd/vboolf4_avx512.h new file mode 100644 index 0000000000..e65f66b025 --- /dev/null +++ 
b/thirdparty/embree/common/simd/vboolf4_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboolf<4> + { + typedef vboolf4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& t) { v = t.v; } + __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 
0xf : 0x0; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0) {} + __forceinline vboolf(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return 
_mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf4& a) { return a.v == 0xf; } + __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h new file mode 100644 index 0000000000..fa84b1b6ee --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -0,0 +1,189 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE bool type */ + template<> + struct vboolf<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 
Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& other) { v = other.v; } + __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } + + __forceinline vboolf(__m128 input) : v(input) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator const __m128i() const { return _mm_castps_si128(v); } + __forceinline operator const __m128d() const { return _mm_castps_pd(v); } + + __forceinline vboolf(bool a) + : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b) + : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_castps_si128(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + __forceinline vboolf4 operator ==(const 
vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + + __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { +#if defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return shuffle<i0,i0,i0,i0>(v); + } + +#if defined(__SSE3__) + template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } +#endif + +#if defined(__SSE4_1__) + template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, 
vboolf4(b)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } + __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } + + __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } + __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } + __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } + + __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__SSE4_2__) + __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } +#else + __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } + __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << 
a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h new file mode 100644 index 0000000000..ba77cc3c5e --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx.h @@ -0,0 +1,202 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX bool type */ + template<> + struct vboolf<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256 v; + struct { __m128 vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& a) { v = a.v; } + __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } + + __forceinline vboolf(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } + __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } + + __forceinline vboolf(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi32(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); +#else + vl = mm_lookupmask_ps[a & 0xF]; + vh = 
mm_lookupmask_ps[a >> 4]; +#endif + } + + __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} + + __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} + __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_castps_si256(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } + + __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { + return _mm256_blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& v) { + return 
_mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } + template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + + __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } + + __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return 
all((!valid) | b); } + __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } + __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx512.h b/thirdparty/embree/common/simd/vboolf8_avx512.h new file mode 100644 index 0000000000..73ff5666e1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboolf<8> + { + typedef vboolf8 
Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& t) { v = t.v; } + __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) + : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x00) {} + __forceinline vboolf(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline 
int all (const vboolf8& a) { return a.v == 0xff; } + __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git 
a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h new file mode 100644 index 0000000000..55326de7dd --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble4_avx.h @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX 64-bit double type */ + template<> + struct vdouble<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + double i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble4& t) { v = t.v; } + __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m256d& t) { v = t; } + __forceinline operator __m256d() const { return v; } + + __forceinline vdouble(double i) { + v = _mm256_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm256_set_pd(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + 
//////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { + _mm256_stream_pd(ptr, a); + } + + static __forceinline vdouble4 loadu(const double* addr) { + return _mm256_loadu_pd(addr); + } + + static __forceinline vdouble4 load(const vdouble4* addr) { + return _mm256_load_pd((double*)addr); + } + + static __forceinline vdouble4 load(const double* addr) { + return _mm256_load_pd(addr); + } + + static __forceinline void store(double* ptr, const vdouble4& v) { + _mm256_store_pd(ptr, v); + } + + static __forceinline void storeu(double* ptr, const vdouble4& v) { + _mm256_storeu_pd(ptr, v); + } + + static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } + __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } +#endif + + __forceinline vdouble4 operator +(const vdouble4& a) { return a; } + __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 operator +(const 
vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } + __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } + __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } + + __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } + __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } + __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } + + __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } + __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } + __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } + + __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } + __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } + __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } + + __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } + __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } + __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } + + __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } + __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } + __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } + + __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } + __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } + __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } + 
+ __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } + __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } + __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__FMA__) + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } +#else + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } + __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } + + __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } + __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } + + __forceinline 
vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } + __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } + + __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } + __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } + + __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } + __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + 
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#endif + + __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } + __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } + + __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } + __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } + + __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } + __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } + + __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } + __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } + + __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } + __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } + + __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } + __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } + + __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } + __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } + __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } + __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } + __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } + __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { 
return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } +#endif + + __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { +#if defined(__AVX512VL__) + return _mm256_mask_blend_pd(m, f, t); +#else + return _mm256_blendv_pd(f, t, m); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vdouble4 shuffle(const 
vdouble4& v) { + return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template<int i> + __forceinline vdouble4 shuffle(const vdouble4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vdouble4 shuffle2(const vdouble4& v) { + return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); + } + + __forceinline double toScalar(const vdouble4& v) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } + __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } + __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); 
} + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vdouble8_avx512.h b/thirdparty/embree/common/simd/vdouble8_avx512.h new file mode 100644 index 0000000000..98d21bfe4a --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble8_avx512.h @@ -0,0 +1,351 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit double type */ + template<> + struct vdouble<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512d v; + double i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble8& t) { v = t.v; } + __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m512d& t) { v = t; } + __forceinline operator __m512d() const { return v; } + 
__forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } + + __forceinline vdouble(double i) { + v = _mm512_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm512_set4_pd(d,c,b,a); + } + + __forceinline vdouble(double a0, double a1, double a2, double a3, + double a4, double a5, double a6, double a7) + { + v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { + _mm512_stream_pd((double*)ptr, a); + } + + static __forceinline vdouble8 loadu(const void* addr) { + return _mm512_loadu_pd((double*)addr); + } + + static __forceinline vdouble8 load(const vdouble8* addr) { + return _mm512_load_pd((double*)addr); + } + + static __forceinline vdouble8 load(const double* addr) { + return _mm512_load_pd(addr); + } + + static __forceinline void store(void* ptr, const vdouble8& v) { + _mm512_store_pd(ptr, v); + } + + static __forceinline void storeu(void* ptr, const vdouble8& v) { + _mm512_storeu_pd(ptr, v); + } + + static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { + _mm512_mask_storeu_pd(ptr, mask, f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { + _mm512_mask_store_pd(addr, mask, v2); + } + 
+ static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { + return _mm512_mask_compress_pd(a, mask, b); + } + + static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } + __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } + + __forceinline vdouble8 operator +(const vdouble8& a) { return a; } + __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } + __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } + __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } + + __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } + __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } + 
__forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; }

__forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
__forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); }
__forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; }

/* bitwise operators work on the raw 64-bit lane patterns */
__forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
__forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); }
__forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; }

__forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
__forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); }
__forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; }

__forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
__forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); }
__forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; }

/* shifts reinterpret the lanes as 64-bit integers; >> is an arithmetic shift */
__forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }

/* per-lane shift counts */
__forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }

/* named shifts: logical left, arithmetic right, logical right */
__forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
__forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
__forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }

__forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
__forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); }
__forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); }

__forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
__forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); }
__forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); }

/* masked operations: lanes selected by the mask receive op(a,b), the other lanes are taken from c */
__forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
__forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }

__forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
__forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////

/* fused multiply-add family: a*b+c, a*b-c, -(a*b)+c, -(a*b)-c */
__forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
__forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
__forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
__forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }


////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; }
__forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; }

__forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; }
__forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; }

__forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; }
__forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; }

__forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; }
__forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; }

__forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; }
__forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; }

/* The shift count is an integer bit count. It was previously declared as
   double and silently truncated on the way to operator <<(…, unsigned int);
   the parameter type now matches the shift operators it forwards to. */
__forceinline vdouble8& operator <<=(vdouble8& a, const unsigned int b) { return a = a << b; }
__forceinline vdouble8& operator >>=(vdouble8& a, const unsigned int b) { return a = a >> b; }


////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////

__forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); }
__forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; }

__forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); }
__forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; }

__forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 operator < (const vdouble8& a, double b) { return a <  vdouble8(b); }
__forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) <  b; }

__forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); }
__forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; }

__forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 operator > (const vdouble8& a, double b) { return a >  vdouble8(b); }
__forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) >  b; }

__forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
__forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); }
__forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; }

/* named forms of the comparisons */
__forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }

/* masked comparisons: the mask is handed to the compare intrinsic */
__forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }

/* Per-lane select: returns t where m is set and f elsewhere. Implemented as a
   plain mask blend; the previous masked OR of t with itself produced the same
   lanes but relied on an AVX512DQ intrinsic for what is a blend. */
__forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
  return _mm512_mask_blend_pd(m, f, t);
}

////////////////////////////////////////////////////////////////////////////////
// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////

/* permutes the two doubles inside every 128-bit lane; the same (i0,i1)
   pattern is encoded for all four lanes */
template<int i0, int i1>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
}

template<int i>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  return shuffle<i, i>(v);
}

/* permutes the four doubles inside each 256-bit half */
template<int i0, int i1, int i2, int i3>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
}

/* selects the 256-bit halves of the result (each as a pair of 128-bit lanes) */
template<int i0, int i1>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
  return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
}

template<int i>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
  return shuffle4<i, i>(v);
}

template<int i>
__forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
  return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
}

/* returns lane 0 as a scalar */
__forceinline double toScalar(const vdouble8& v) {
  return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
}

////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

/* each stage combines a vector with a shuffled copy of itself: neighbouring
   lanes (…2), then pairs inside a 256-bit half (…4), then the two halves */
__forceinline vdouble8 vreduce_add2(vdouble8 x) {                      return x + shuffle<1,0,3,2>(x); }
__forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
__forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }

__forceinline vdouble8 vreduce_min2(vdouble8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
__forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
__forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }

__forceinline vdouble8 vreduce_max2(vdouble8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
__forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
__forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }

/* scalar results of the full reductions */
__forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); }
__forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); }
__forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); }

//////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { + return _mm512_permutexvar_pd(index, v); + } + + __forceinline vdouble8 reverse(const vdouble8& a) { + return permute(a, vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h new file mode 100644 index 0000000000..9f1e2459c4 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat16_avx512.h @@ -0,0 +1,615 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 float type */ + template<> + struct vfloat<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512 v; + float f[16]; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline vfloat() {} + __forceinline vfloat(const vfloat16& t) { v = t; } + __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } + + __forceinline vfloat(const __m512& t) { v = t; } + __forceinline operator __m512() const { return v; } + __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } + __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } + + __forceinline vfloat(float f) { + v = _mm512_set1_ps(f); + } + + __forceinline vfloat(float a, float b, float c, float d) { + v = _mm512_set4_ps(a, b, c, d); + } + + __forceinline vfloat(const vfloat4& i) { + v = _mm512_broadcast_f32x4(i); + } + + __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { + v = _mm512_castps128_ps512(a); + v = _mm512_insertf32x4(v, b, 1); + v = _mm512_insertf32x4(v, c, 2); + v = _mm512_insertf32x4(v, d, 3); + } + + __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { + v = _mm512_broadcast_f32x4(a); + v = _mm512_mask_broadcast_f32x4(v,mask,b); + } + + __forceinline vfloat(const vfloat8& i) { + v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); + } + + __forceinline vfloat(const vfloat8& a, const vfloat8& b) { + v = _mm512_castps256_ps512(a); +#if defined(__AVX512DQ__) + v = _mm512_insertf32x8(v, b, 1); +#else + v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); +#endif + } + + /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ + /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { + __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); + aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); + v = _mm512_castpd_ps(aa); + }*/ + + __forceinline explicit vfloat(const vint16& a) { + v = _mm512_cvtepi32_ps(a); + } + + __forceinline explicit vfloat(const vuint16& a) { + v = _mm512_cvtepu32_ps(a); + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } + static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } + + static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { + _mm512_stream_ps((float*)ptr,a); + } + + static __forceinline vfloat16 broadcast(const float* f) { + return 
_mm512_set1_ps(*f); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { + return _mm512_i32gather_ps(index, ptr, scale); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { + vfloat16 r = zero; + return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); + } + + template<int scale = 4> + static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { + _mm512_i32scatter_ps(ptr, index, v, scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { + _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } + __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } + __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } + __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } + + __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } + __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } + + __forceinline vfloat16 operator +(const vfloat16& a) { return a; } + __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } + + __forceinline vfloat16 abs (const vfloat16& a) { return 
_mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } + __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } + + __forceinline vfloat16 rcp(const vfloat16& a) { + const vfloat16 r = _mm512_rcp14_ps(a); + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); + } + + __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } + __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } + + __forceinline vfloat16 rsqrt(const vfloat16& a) + { + const vfloat16 r = _mm512_rsqrt14_ps(a); + return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, + _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } + __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } + __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } + + __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } + __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } + __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } + + __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } + __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } + __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } + + __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); 
} + __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } + __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } + + __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } + __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } + __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); + } + + __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); } + __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); } + __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); } + + __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); } + __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); } + __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); } + + __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_min_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_max_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } + __forceinline 
vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } + __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } + __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } + + __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } + __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } + + __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } + __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } + + __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } + __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } + __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } + + __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } + 
/// Lane-wise `a != b`; the scalar left operand is broadcast to every lane.
__forceinline vboolf16 operator !=(float a, const vfloat16& b) {
  const vfloat16 lhs(a);
  return lhs != b;
}

// NOTE(review): the relational helpers below hand _MM_CMPINT_* (integer-compare)
// predicates to the floating-point compare intrinsic. Presumably GE/GT therefore
// act as "not less-than" / "not less-or-equal" and report true for NaN operands
// -- confirm against the Intel intrinsics documentation before relying on it.

/// Lane-wise `a < b`; returns one mask bit per lane.
__forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LT);
}
__forceinline vboolf16 operator < (const vfloat16& a, float b) {
  const vfloat16 rhs(b);
  return a < rhs;
}
__forceinline vboolf16 operator < (float a, const vfloat16& b) {
  const vfloat16 lhs(a);
  return lhs < b;
}

/// Lane-wise `a >= b`; returns one mask bit per lane.
__forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GE);
}
__forceinline vboolf16 operator >=(const vfloat16& a, float b) {
  const vfloat16 rhs(b);
  return a >= rhs;
}
__forceinline vboolf16 operator >=(float a, const vfloat16& b) {
  const vfloat16 lhs(a);
  return lhs >= b;
}

/// Lane-wise `a > b`; returns one mask bit per lane.
__forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GT);
}
__forceinline vboolf16 operator > (const vfloat16& a, float b) {
  const vfloat16 rhs(b);
  return a > rhs;
}
__forceinline vboolf16 operator > (float a, const vfloat16& b) {
  const vfloat16 lhs(a);
  return lhs > b;
}

/// Lane-wise `a <= b`; returns one mask bit per lane.
__forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LE);
}
__forceinline vboolf16 operator <=(const vfloat16& a, float b) {
  const vfloat16 rhs(b);
  return a <= rhs;
}
__forceinline vboolf16 operator <=(float a, const vfloat16& b) {
  const vfloat16 lhs(a);
  return lhs <= b;
}

// Named spellings of the comparisons above (same intrinsic, same predicate).
__forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_EQ);
}
__forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_NE);
}
__forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LT);
}
__forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GE);
}
__forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GT);
}
__forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) {
  return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LE);
}

// Masked variants: `mask` is forwarded as the write-mask of the compare, so
// only lanes enabled in `mask` can be set in the result.
__forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ);
}
__forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE);
}
__forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT);
}
__forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE);
}
__forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT);
}
__forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
  return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE);
}

/// Per-lane choice: lane of `t` where `s` is set, lane of `f` otherwise.
__forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {
  // The blend intrinsic takes the "mask clear" source first, hence (f, t).
  return _mm512_mask_blend_ps(s, f, t);
}

/// Linear interpolation, evaluated as t*(b-a)+a through madd().
__forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
  const vfloat16 delta = b - a;
  return madd(t, delta, a);
}

////////////////////////////////////////////////////////////////////////////////
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////

/// Lane-wise floor.
__forceinline vfloat16 floor(const vfloat16& a) {
  return _mm512_floor_ps(a);
}

/// Lane-wise ceiling.
__forceinline vfloat16 ceil (const vfloat16& a) {
  return _mm512_ceil_ps(a);
}

/// Lane-wise rounding to the nearest integer value, exceptions suppressed.
__forceinline vfloat16 round (const vfloat16& a) {
  return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

/// Lane-wise conversion to int32, rounding toward negative infinity.
__forceinline vint16 floori (const vfloat16& a) {
  return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } + __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 permute(vfloat16 v, __m512i index) { + return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); + } + + __forceinline vfloat16 reverse(const vfloat16& v) { + return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + } + + template<int i> + __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + template<int i> + __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const 
vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + __forceinline vfloat16 shift_left_1(const vfloat16& a) { + vfloat16 z = zero; + return mask_align_shift_right<15>(0xfffe,z,a,a); + } + + __forceinline vfloat16 shift_right_1(const vfloat16& x) { + return align_shift_right<1>(zero,x); + } + + __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } + + + template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } + + template<int N, int i> + vfloat<N> extractN(const vfloat16& v); + + template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } + template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } + template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } + + template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } + + template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + + template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } + template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void 
transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + vfloat16 a0a2_b0b2 = unpacklo(r0, r2); + vfloat16 c0c2_d0d2 = unpackhi(r0, r2); + vfloat16 a1a3_b1b3 = unpacklo(r1, r3); + vfloat16 c1c3_d1d3 = unpackhi(r1, r3); + + c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); + c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); + c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); + c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, + const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, + const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), + c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; + transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); + + vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; + transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); + + c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + 
c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, + const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, + const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), + vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), + c0, c1, c2, c3, c4, c5, c6, c7); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vfloat16 
vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } + + __forceinline size_t select_min(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_max(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(pos_inf)); + const vbool16 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(neg_inf)); + const vbool16 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? 
valid_max : valid)); + } + + __forceinline vfloat16 prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + __forceinline vfloat16 prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<16-1>(v,z)); + v = min(v,align_shift_right<16-2>(v,z)); + v = min(v,align_shift_right<16-4>(v,z)); + v = min(v,align_shift_right<16-8>(v,z)); + return v; + } + + __forceinline vfloat16 prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<16-1>(v,z)); + v = max(v,align_shift_right<16-2>(v,z)); + v = max(v,align_shift_right<16-4>(v,z)); + v = max(v,align_shift_right<16-8>(v,z)); + return v; + } + + + __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<1>(z,v)); + v = min(v,align_shift_right<2>(z,v)); + v = min(v,align_shift_right<4>(z,v)); + v = min(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<1>(z,v)); + v = max(v,align_shift_right<2>(z,v)); + v = max(v,align_shift_right<4>(z,v)); + v = max(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 rcp_safe(const vfloat16& a) { + return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output 
Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h new file mode 100644 index 0000000000..5215bf9730 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -0,0 +1,722 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE float type */ + template<> + struct vfloat<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat4& other) { v = other.v; } + __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } + + __forceinline vfloat(__m128 a) : v(a) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator __m128&() { return v; } + + __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} + + __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} + 
__forceinline explicit vfloat(const vuint4& x) { + const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128 af = _mm_cvtepi32_ps(a); + const __m128 bf = _mm_castsi128_ps(b); + v = _mm_add_ps(af,bf); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } + static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } + + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf4& 
mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +#else + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } +#endif + +#if defined(__AVX__) + static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } +#endif + + static __forceinline vfloat4 load_nt (const float* ptr) { +#if defined (__SSE4_1__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps(ptr); +#endif + } + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const char* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const char* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const unsigned char* ptr) { + return 
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const unsigned char* ptr) { + //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const short* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const short* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + + static __forceinline vfloat4 load(const unsigned short* ptr) { + return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + } + + static __forceinline void store_nt(void* ptr, const vfloat4& v) + { +#if defined (__SSE4_1__) + _mm_stream_ps((float*)ptr,v); +#else + _mm_store_ps((float*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_ps(ptr, index, scale); +#else + return vfloat4( + *(float*)(((char*)ptr)+scale*index[0]), + *(float*)(((char*)ptr)+scale*index[1]), + *(float*)(((char*)ptr)+scale*index[2]), + *(float*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { + vfloat4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& 
index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_ps((float*)ptr, index, v, scale); +#else + *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); +#else + if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } + + friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_ps(m, f, t); +#elif defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Load/Store + //////////////////////////////////////////////////////////////////////////////// + + 
template<> struct mem<vfloat4> + { + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return vfloat4::load (mask,ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return vfloat4::loadu(mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::store (mask,ptr,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::storeu(mask,ptr,v); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } + __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } + __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } + + __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } + __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } + + __forceinline vfloat4 operator +(const vfloat4& a) { return a; } + __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#if defined(__AVX512VL__) + __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } +#else + __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } +#endif + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 rcp(const vfloat4& a) + { +#if defined(__AVX512VL__) + const vfloat4 r = _mm_rcp14_ps(a); +#else + const vfloat4 r = _mm_rcp_ps(a); 
+#endif + +#if defined(__AVX2__) + return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); +#endif + } + __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } + __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } + + __forceinline vfloat4 rsqrt(const vfloat4& a) + { +#if defined(__AVX512VL__) + vfloat4 r = _mm_rsqrt14_ps(a); +#else + vfloat4 r = _mm_rsqrt_ps(a); +#endif + +#if defined(__ARM_NEON) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#elif defined(__AVX2__) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#else + r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + return r; + } + + __forceinline vboolf4 isnan(const vfloat4& a) { + const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); +#if defined(__AVX512VL__) + return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); +#else + return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } + __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } + + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return 
_mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } + __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } + + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } + __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } + + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } + __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } + + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } + __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + +#if defined(__SSE4_1__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 
maxi(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } +#else + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + return min(a,b); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + return max(a,b); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) || defined(__ARM_NEON) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + + 
//////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } + __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } + + __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } + __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } + + __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } + __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } + + __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } + __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } + __forceinline vboolf4 operator !=(const vfloat4& a, 
const vfloat4& b) { return _mm_cmpneq_ps(a, b); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } +#endif + + __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } + __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } + + __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } + __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } + + __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } + __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } + + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } + __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } + + __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } + __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } + + __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } + __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } + + __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } + __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } + __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } + __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } + __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } + __forceinline 
vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) + { +#if defined(__SSE4_1__) + return _mm_blend_ps(f, t, mask); +#else + return select(vboolf4(mask), t, f); +#endif + } + + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { + return madd(t,b-a,a); + } + + __forceinline bool 
isvalid(const vfloat4& v) { + return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite(const vfloat4& a) { + return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { + return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } +#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } + __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } +#endif + __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } + + __forceinline vint4 floori(const vfloat4& a) { +#if defined(__SSE4_1__) + return 
vint4(floor(a)); +#else + return vint4(a-vfloat4(0.5f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + +#if defined(__SSE3__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } +#endif + + template<int i> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return shuffle<i,i,i,i>(v); + } + + template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } + +#if defined (__SSE4_1__) + template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } 
#else
/* Fallback without SSE4.1: copy `a` and overwrite a single lane through
 * operator[]. Both lane indices are masked with `&3`. */
template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; }
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
#endif

/* Lane 0 as a scalar. */
__forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }

/* Byte-shift the whole register right by 4 bytes (one float lane); the
 * shift is done in the integer domain. */
__forceinline vfloat4 shift_right_1(const vfloat4& x) {
  return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4));
}

#if defined (__AVX2__)
/* Runtime permutation: lanes of `a` are picked by the indices in `index`. */
__forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) {
  return _mm_permutevar_ps(a,index);
}

/* Broadcast the float stored at `a` to all lanes. */
__forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); }

#endif

#if defined(__AVX512VL__)
/* Thin wrapper over _mm_alignr_epi32 applied to the bit patterns of `a` and
 * `b`, with `i` as the element shift count. */
template<int i>
__forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
  return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
}
#endif


////////////////////////////////////////////////////////////////////////////////
/// Sorting Network
////////////////////////////////////////////////////////////////////////////////

/* Sorts the four lanes in ascending order with three compare-exchange stages.
 * Each stage pairs every lane with a partner (shuffle), computes the lane-wise
 * min and max, and blends them so that the lower lane of each pair keeps the
 * min and the upper lane keeps the max:
 *   stage 1: pairs (0,1) and (2,3)
 *   stage 2: pairs (0,2) and (1,3)  -> lane 0 is the minimum, lane 3 the maximum
 *   stage 3: pair  (1,2)            -> orders the two middle lanes */
__forceinline vfloat4 sort_ascending(const vfloat4& v)
{
  const vfloat4 a0 = v;
  const vfloat4 b0 = shuffle<1,0,3,2>(a0);
  const vfloat4 c0 = min(a0,b0);
  const vfloat4 d0 = max(a0,b0);
  const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
  const vfloat4 b1 = shuffle<2,3,0,1>(a1);
  const vfloat4 c1 = min(a1,b1);
  const vfloat4 d1 = max(a1,b1);
  const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
  const vfloat4 b2 = shuffle<0,2,1,3>(a2);
  const vfloat4 c2 = min(a2,b2);
  const vfloat4 d2 = max(a2,b2);
  const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
  return a3;
}

/* Same network as sort_ascending with min and max swapped in every stage, so
 * the lanes end up in descending order. */
__forceinline vfloat4 sort_descending(const vfloat4& v)
{
  const vfloat4 a0 = v;
  const vfloat4 b0 = shuffle<1,0,3,2>(a0);
  const vfloat4 c0 = max(a0,b0);
  const vfloat4 d0 = min(a0,b0);
  const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
  const vfloat4 b1 = shuffle<2,3,0,1>(a1);
  const vfloat4 c1 = max(a1,b1);
  const vfloat4 d1 = min(a1,b1);
  const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
  const vfloat4 b2 = shuffle<0,2,1,3>(a2);
  const vfloat4 c2 = max(a2,b2);
  const vfloat4 d2 = min(a2,b2);
  const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
  return a3;
}

////////////////////////////////////////////////////////////////////////////////
/// Transpose
////////////////////////////////////////////////////////////////////////////////

/* 4x4 transpose: rows r0..r3 in, columns c0..c3 out (cK holds lane K of every
 * row). Two rounds of unpacklo/unpackhi: first rows (0,2) and (1,3) are
 * interleaved, then the intermediate results are interleaved again. */
__forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
{
  vfloat4 l02 = unpacklo(r0,r2);
  vfloat4 h02 = unpackhi(r0,r2);
  vfloat4 l13 = unpacklo(r1,r3);
  vfloat4 h13 = unpackhi(r1,r3);
  c0 = unpacklo(l02,l13);
  c1 = unpackhi(l02,l13);
  c2 = unpacklo(h02,h13);
  c3 = unpackhi(h02,h13);
}

/* Same as above but only the first three columns are written. */
__forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
{
  vfloat4 l02 = unpacklo(r0,r2);
  vfloat4 h02 = unpackhi(r0,r2);
  vfloat4 l13 = unpacklo(r1,r3);
  vfloat4 h13 = unpackhi(r1,r3);
  c0 = unpacklo(l02,l13);
  c1 = unpackhi(l02,l13);
  c2 = unpacklo(h02,h13);
}

////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

/* Horizontal reductions that leave the result in every lane: first combine
 * neighbouring lanes (0<->1, 2<->3), then combine the two halves. */
__forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
__forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
__forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }

/* Scalar versions: lane 0 of the corresponding vector reduction. */
__forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
__forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
__forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }

/* Index of a lane holding the minimum among the lanes enabled in `valid`.
 * Disabled lanes are replaced by +inf before reducing. If no enabled lane
 * compares equal to the reduced minimum, the first enabled lane is returned
 * instead (bsf over `valid`). */
__forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
{
  const vfloat4 a = select(valid,v,vfloat4(pos_inf));
  const vbool4 valid_min = valid & (a == vreduce_min(a));
  return bsf(movemask(any(valid_min) ? valid_min : valid));
}
/* Maximum counterpart of select_min; disabled lanes are replaced by -inf. */
__forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
{
  const vfloat4 a = select(valid,v,vfloat4(neg_inf));
  const vbool4 valid_max = valid & (a == vreduce_max(a));
  return bsf(movemask(any(valid_max) ? valid_max : valid));
}

////////////////////////////////////////////////////////////////////////////////
/// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////

/* Sum of the lane-wise products; note that all four lanes contribute. */
__forceinline float dot(const vfloat4& a, const vfloat4& b) {
  return reduce_add(a*b);
}

/* Cross product of the first three lanes: computes a*b.yzx - a.yzx*b with one
 * fused multiply-subtract and rotates the result by the same <1,2,0,3>
 * permutation. Lane 3 is carried through the same arithmetic. */
__forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
{
  const vfloat4 a0 = a;
  const vfloat4 b0 = shuffle<1,2,0,3>(b);
  const vfloat4 a1 = shuffle<1,2,0,3>(a);
  const vfloat4 b1 = b;
  return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
}

////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////

/* Prints the four lanes as "<a0, a1, a2, a3>". */
__forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) {
  return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
}

}

#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h
new file mode 100644
index 0000000000..13446454e8
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat8_avx.h
@@ -0,0 +1,758 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl

namespace embree
{
/* 8-wide AVX float type */
template<>
struct vfloat<8>
{
  ALIGNED_STRUCT_(32);

  typedef vboolf8 Bool;
  typedef vint8 Int;
  typedef vfloat8 Float;

  enum { size = 8 }; // number of SIMD elements
  union { __m256 v; float f[8]; int i[8]; }; // data

  ////////////////////////////////////////////////////////////////////////////////
  /// Constructors, Assignment & Cast Operators
  ////////////////////////////////////////////////////////////////////////////////

  /* The default constructor leaves the lanes uninitialized. */
  __forceinline vfloat() {}
  __forceinline vfloat(const vfloat8& other) { v = other.v; }
  __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }

  /* Implicit conversion from and to the raw __m256 register. */
  __forceinline vfloat(__m256 a) : v(a) {}
  __forceinline operator const __m256&() const { return v; }
  __forceinline operator __m256&() { return v; }

  /* From 4-wide vectors: `a` in both halves, or `a` low and `b` high. */
  __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
  __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}

  /* Unaligned load of eight floats from the address `a`. */
  __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
  /* Broadcast / repeating patterns. _mm256_set_ps lists the highest lane
   * first, so lane 0 receives `a` in each of these constructors. */
  __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
  __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
  __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
  __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}

  /* Value conversion (not a bit cast) from eight 32-bit integers. */
  __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Constants
  ////////////////////////////////////////////////////////////////////////////////

  /* Tag-dispatched constant constructors; StepTy yields lanes 0,1,...,7. */
  __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {}
  __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {}
  __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
  __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
  __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {}
  __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {}
  __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Loads and Stores
  ////////////////////////////////////////////////////////////////////////////////

  /* Broadcast the float stored at `a` to all eight lanes. */
  static __forceinline vfloat8 broadcast(const void* a) {
    return _mm256_broadcast_ss((float*)a);
  }

  /* Converting loads: eight narrow integers are widened to 32 bit and converted
   * to float. Without AVX2 the work is delegated to two vfloat4 loads.
   * NOTE(review): the AVX2 paths issue a full 16-byte _mm_loadu_si128; for the
   * char / unsigned char overloads only the low 8 bytes are converted, so
   * callers presumably guarantee that 16 bytes are readable -- confirm. */
  static __forceinline vfloat8 load(const char* ptr) {
#if defined(__AVX2__)
    return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
#else
    return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
#endif
  }

  static __forceinline vfloat8 load(const unsigned char* ptr) {
#if defined(__AVX2__)
    return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
#else
    return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
#endif
  }

  static __forceinline vfloat8 load(const short* ptr) {
#if defined(__AVX2__)
    return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
#else
    return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
#endif
  }

  /* Plain float loads and stores; the `u` variants use the unaligned
   * intrinsics. */
  static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); }
  static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); }

  static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); }
  static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); }
#if defined(__AVX512VL__)

  /* Masked loads and stores. The AVX-512VL path passes a zero vector as the
   * source for lanes that are not selected by `mask`. */
  static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); }
  static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }

  static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
  static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
#else
  /* AVX path: the aligned and unaligned variants share the same
   * maskload / maskstore intrinsic; the mask is reinterpreted as __m256i. */
  static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
  static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }

  static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
  static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
#endif

  /* Non-temporal (streaming) load; only available with AVX2. */
#if defined(__AVX2__)
  static __forceinline vfloat8 load_nt(void* ptr) {
    return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr));
  }
#endif

  /* Non-temporal (streaming) store. */
  static __forceinline void store_nt(void* ptr, const vfloat8& v) {
    _mm256_stream_ps((float*)ptr,v);
  }

  /* Gather eight floats from `ptr + scale*index[k]`, where the offset is in
   * bytes. AVX2 uses the hardware gather; otherwise each lane is read with a
   * scalar load. */
  template<int scale = 4>
  static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
#if defined(__AVX2__)
    return _mm256_i32gather_ps(ptr, index ,scale);
#else
    return vfloat8(
        *(float*)(((char*)ptr)+scale*index[0]),
        *(float*)(((char*)ptr)+scale*index[1]),
        *(float*)(((char*)ptr)+scale*index[2]),
        *(float*)(((char*)ptr)+scale*index[3]),
        *(float*)(((char*)ptr)+scale*index[4]),
        *(float*)(((char*)ptr)+scale*index[5]),
        *(float*)(((char*)ptr)+scale*index[6]),
        *(float*)(((char*)ptr)+scale*index[7]));
#endif
  }

  /* Masked gather: lanes not enabled in `mask` keep the value 0 that `r` is
   * initialized with; in the scalar fallback disabled lanes are never read
   * from memory. */
  template<int scale = 4>
  static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) {
    vfloat8 r = zero;
#if defined(__AVX512VL__)
    return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
    return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
#else
    if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
    if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
    if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
    if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
    if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
    if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
    if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
    if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
    return r;
#endif
  }

  /* Scatter lane k of `v` to `ptr + scale*ofs[k]` (byte offset). A hardware
   * scatter is used only with AVX-512VL; otherwise eight scalar stores are
   * issued in lane order. */
  template<int scale = 4>
  static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
  {
#if defined(__AVX512VL__)
    _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
#else
    *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
    *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
    *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
    *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
    *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
    *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
    *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
    *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
#endif
  }

  /* Masked scatter: only lanes enabled in `mask` are written. */
  template<int scale = 4>
  static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
  {
#if defined(__AVX512VL__)
    _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
#else
    if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
    if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
    if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
    if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
    if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
    if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
    if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
    if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
#endif
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Array Access
  ////////////////////////////////////////////////////////////////////////////////

  /* Per-lane access through the `f` member of the data union; the index is
   * only checked by assert. */
  __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; }
  __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; }
};


////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////

/* asFloat / asInt reinterpret the bits; toInt / toFloat go through the
 * converting constructors of vint8 / vfloat8. */
__forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); }
__forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); }

__forceinline vint8 toInt (const vfloat8& a) { return vint8(a); }
__forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }

__forceinline vfloat8 operator +(const vfloat8& a) { return a; }
/* Negation flips the sign bit (XOR with 0x80000000). */
__forceinline vfloat8 operator -(const vfloat8& a) {
  const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
  return _mm256_xor_ps(a, mask);
}
/* Absolute value clears the sign bit (AND with 0x7fffffff). */
__forceinline vfloat8 abs(const vfloat8& a) {
  const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
  return _mm256_and_ps(a, mask);
}
/* sign: -1 in lanes where the "not >= 0" predicate (_CMP_NGE_UQ) holds,
 * +1 elsewhere. signmsk: isolates the sign bit of each lane. */
__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
__forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }


/* Reciprocal: hardware estimate `r` refined by one Newton-Raphson step,
 * r * (2 - r*a). With AVX2 the inner term is a fused negative multiply-add. */
static __forceinline vfloat8 rcp(const vfloat8& a)
{
#if defined(__AVX512VL__)
  const vfloat8 r = _mm256_rcp14_ps(a);
#else
  const vfloat8 r = _mm256_rcp_ps(a);
#endif

#if defined(__AVX2__)
  return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
#else
  return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
#endif
}
__forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
__forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); }

/* Reciprocal square root: hardware estimate `r` refined by one Newton-Raphson
 * step, evaluated as 1.5*r + (a * -0.5 * r) * (r*r). */
static __forceinline vfloat8 rsqrt(const vfloat8& a)
{
#if defined(__AVX512VL__)
  const vfloat8 r = _mm256_rsqrt14_ps(a);
#else
  const vfloat8 r = _mm256_rsqrt_ps(a);
#endif

#if defined(__AVX2__)
  return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r,
                         _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
#else
  return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r),
                       _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
#endif
}

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////

/* Lane-wise arithmetic. The float overloads broadcast the scalar and forward
 * to the vector/vector operator. */
__forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); }
__forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); }
__forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; }

__forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
__forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); }
__forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; }

__forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
__forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); }
__forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; }

__forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
__forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); }
__forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; }

/* Bitwise operations on the raw float bit patterns. */
__forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); }
__forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
__forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
__forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); }

/* Lane-wise floating-point min / max. */
__forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); }
__forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); }
__forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }

__forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); }
__forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); }
__forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }

/* need "static __forceinline" for MSVC, otherwise we'll link the wrong version in debug mode */
/* mini / maxi (and minui / maxui) compare the float bit patterns as signed
 * (unsigned) 32-bit integers. The unsigned variants exist only with AVX2; the
 * fallback routes mini / maxi through the vint8 min / max. */
#if defined(__AVX2__)

static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
  const vint8 ai = _mm256_castps_si256(a);
  const vint8 bi = _mm256_castps_si256(b);
  const vint8 ci = _mm256_min_epi32(ai,bi);
  return _mm256_castsi256_ps(ci);
}

static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
  const vint8 ai = _mm256_castps_si256(a);
  const vint8 bi = _mm256_castps_si256(b);
  const vint8 ci = _mm256_max_epi32(ai,bi);
  return _mm256_castsi256_ps(ci);
}

static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) {
  const vint8 ai = _mm256_castps_si256(a);
  const vint8 bi = _mm256_castps_si256(b);
  const vint8 ci = _mm256_min_epu32(ai,bi);
  return _mm256_castsi256_ps(ci);
}

static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) {
  const vint8 ai = _mm256_castps_si256(a);
  const vint8 bi = _mm256_castps_si256(b);
  const vint8 ci = _mm256_max_epu32(ai,bi);
  return _mm256_castsi256_ps(ci);
}

#else

static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
  return asFloat(min(asInt(a),asInt(b)));
}

static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
  return asFloat(max(asInt(a),asInt(b)));
}

#endif

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////

/* Multiply-add family: madd = a*b+c, msub = a*b-c, nmadd = -a*b+c,
 * nmsub = -a*b-c. With AVX2 these map to the FMA intrinsics; otherwise they
 * are evaluated as a separate multiply followed by an add or subtract. */
#if defined(__AVX2__)
static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); }
static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); }
static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); }
static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); }
#else
static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
#endif

////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } + __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } + + __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } + __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } + + __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } + __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } + + __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } + __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_mask_blend_ps(m, f, t); + } +#else + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + static __forceinline vboolf8 
operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + /* NaN behaviour of this non-AVX512VL branch follows from the predicates chosen: < and <= use ordered predicates (result is false if either operand is NaN), whereas >= and > are encoded as the unordered "not less than" / "not less or equal" predicates and therefore yield true when an operand is NaN. */ static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + + /* Per-lane select: a lane takes t when the sign bit of the corresponding element of m is set, otherwise f (semantics of _mm256_blendv_ps; note the f,t argument order). */ static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } +#endif + + /* Compile-time select: lane i is taken from t when bit i of the immediate `mask` is set and from f when it is clear. */ template<int mask> + __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { + return _mm256_blend_ps(f, t, mask); + } + + /* Mixed vector/scalar comparisons: the float operand is wrapped in a vfloat8 (presumably a broadcast; that constructor is not visible here) and the vector operator is reused. */ __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } + __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } + + __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } + __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } + + __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } + __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } + + __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } + __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } + + __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } + __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } + + __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } + __forceinline vboolf8 operator <=(const float& 
a, const vfloat8& b) { return vfloat8(a) <= b; } + + __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } + __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } + __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } + __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } + __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } + __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } + static __forceinline 
vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } +#endif + + __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid (const vfloat8& v) { + return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); + } + + __forceinline bool is_finite (const vfloat8& a) { + return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { + return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, 
int i1> + __forceinline vfloat8 shuffle4(const vfloat8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } + template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } + template<size_t i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } + + __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } + +#if defined (__AVX2__) + static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { + return _mm256_permutevar8x32_ps(a, index); + } +#endif + +#if defined(__AVX512VL__) + template<int i> + static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { + return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); + } +#endif + +#if defined (__AVX_I__) + template<const int mode> + 
static __forceinline vint4 convert_to_hf16(const vfloat8& a) { + return _mm256_cvtps_ph(a, mode); + } + + /* Widens eight packed half-precision values held in a vint4 to eight floats. */ static __forceinline vfloat8 convert_from_hf16(const vint4& a) { + return _mm256_cvtph_ps(a); + } +#endif + +#if defined(__AVX512VL__) + /* Moves every element one lane down (result[i] = x[i+1]); the top lane is filled from the `zero` operand of align_shift_right. */ static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + return align_shift_right<1>(zero,x); + } +#else + /* AVX fallback. t0 rotates each 128-bit half by one element, t1 swaps the halves of t0, and the 0x88 blend takes lanes 3 and 7 from t1, giving { x1,x2,x3,x4,x5,x6,x7,x0 }. NOTE(review): this is a rotation - lane 7 receives x[0], whereas the AVX512VL branch presumably yields 0 there; confirm that callers ignore the top lane. */ static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + const vfloat8 t0 = shuffle<1,2,3,0>(x); + const vfloat8 t1 = shuffle4<1,0>(t0); + return _mm256_blend_ps(t0,t1,0x88); + } +#endif + + /* Applies floor() and converts the result to vint8 (conversion semantics are those of the vint8 constructor, which is not visible here). */ __forceinline vint8 floori(const vfloat8& a) { + return vint8(floor(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + /* 4x4 transpose built from two rounds of unpacklo/unpackhi. The unpack intrinsics work on each 128-bit half separately, so the low and the high halves of r0..r3 are transposed independently of each other. */ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + /* Same as the 4x4 variant above but only the first three output columns are produced. */ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + /* 8x8 transpose: two per-half 4x4 transposes (rows 0-3 and rows 4-7) followed by shuffle4 to recombine the 128-bit halves of the intermediate results. */ __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) + { + vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); + vfloat8 h4,h5,h6,h7; 
transpose(r4,r5,r6,r7,h4,h5,h6,h7); + c0 = shuffle4<0,2>(h0,h4); + c1 = shuffle4<0,2>(h1,h5); + c2 = shuffle4<0,2>(h2,h6); + c3 = shuffle4<0,2>(h3,h7); + c4 = shuffle4<1,3>(h0,h4); + c5 = shuffle4<1,3>(h1,h5); + c6 = shuffle4<1,3>(h2,h6); + c7 = shuffle4<1,3>(h3,h7); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 
+ shuffle<2,3,0,1>(v1); } + /* Full 8-lane sum: adds the per-half sums from vreduce_add4 to a copy with the two 128-bit halves swapped, so every lane ends up holding the total. */ __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + /* Scalar reductions: run the vector reduction and extract lane 0 with toScalar. */ __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } + + /* Returns the index of a lane that holds the minimum of v among the lanes enabled in `valid`. Disabled lanes are replaced by pos_inf before the reduction; the index comes from bsf (presumably bit-scan-forward) over the lane mask. If no enabled lane compares equal to the reduced minimum, the first set bit of `valid` is used instead. NOTE(review): the result for an all-false `valid` (bsf of an empty mask) is not established here - confirm callers never pass an empty mask. */ __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(pos_inf)); + const vbool8 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + /* Mirror image of select_min: disabled lanes are replaced by neg_inf and the maximum is located; the same fallback to the first set bit of `valid` applies. */ __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(neg_inf)); + const vbool8 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidean Space Operators (pairs of Vec3fa's) + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + // return vreduce_add4(a*b); + //} + + /* Three-component dot product evaluated separately in each 128-bit half: immediate 0x7F multiplies elements 0..2 (high nibble 0x7) and writes the sum to all four elements of that half (low nibble 0xF). */ __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + return _mm256_dp_ps(a,b,0x7F); + } + + /* Cross product per 128-bit half using the rotate-multiply-rotate formulation: combine a with b rotated to (y,z,x,w) and a rotated with b, then rotate the result once more. msub is defined elsewhere - presumably a0*b0 - a1*b1; verify. */ __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) + { + const vfloat8 a0 = a; + const vfloat8 b0 = shuffle<1,2,0,3>(b); + const vfloat8 a1 = shuffle<1,2,0,3>(a); + const vfloat8 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } + //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } + //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } + //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } + __forceinline vfloat<8> normalize(const vfloat<8>& a) { 
return a*rsqrt(dot(a,a)); } + //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } + //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } + //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } + //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } + + //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { + // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + //} + + //////////////////////////////////////////////////////////////////////////////// + /// In Register Sorting + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 sort_ascending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = min(a0,b0); + const vfloat8 d0 = max(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = min(a1,b1); + const vfloat8 d1 = max(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = min(a2,b2); + const vfloat8 d2 = max(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = min(a3,b3); + const vfloat8 d3 = max(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = min(a4,b4); + const vfloat8 d4 = max(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = min(a5,b5); + const vfloat8 d5 = max(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vfloat8 sort_descending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); 
+ const vfloat8 c0 = max(a0,b0); + const vfloat8 d0 = min(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = max(a1,b1); + const vfloat8 d1 = min(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = max(a2,b2); + const vfloat8 d2 = min(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = max(a3,b3); + const vfloat8 d3 = min(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = max(a4,b4); + const vfloat8 d4 = min(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = max(a5,b5); + const vfloat8 d5 = min(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h new file mode 100644 index 0000000000..3720c3c9d6 --- /dev/null +++ b/thirdparty/embree/common/simd/vint16_avx512.h @@ -0,0 +1,472 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define 
vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 integer type */ + template<> + struct vint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint16& t) { v = t.v; } + __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } + + __forceinline vint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vint(int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vint(int a, int b, int c, int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vint(int a0 , int a1 , int a2 , int a3, + int a4 , int a5 , int a6 , int a7, + int a8 , int a9 , int a10, int a11, + int a12, int a13, int a14, int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vint(const vint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { + v = _mm512_castsi128_si512(a); + v = _mm512_inserti32x4(v, b, 1); + v = _mm512_inserti32x4(v, c, 2); + v = _mm512_inserti32x4(v, d, 3); + } + + __forceinline vint(const vint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vint(const vint8& a, const vint8& b) { + v = _mm512_castsi256_si512(a); + v = _mm512_inserti64x4(v, b, 1); + } + + __forceinline explicit vint(const __m512& f) { + v = _mm512_cvtps_epi32(f); + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } + + static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } + + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + + static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } + static __forceinline void storeu(void* ptr, const vint16& v) { 
_mm512_storeu_si512(ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } + + static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vint16 gather(const int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int& operator 
[](size_t index) { assert(index < 16); return i[index]; } + __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + /* Reads lane `index` of the int array reinterpreted as unsigned int. */ __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + /* Views the int array as size_t elements and returns a reference to element `index`; the index < 8 assertion suggests an 8-byte size_t is assumed. NOTE(review): this is a const member that hands out a non-const reference obtained through a C-style cast, and its name hides the uint64_t type name inside the class scope. */ __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + /* Builds a lane mask from the most significant bit of each 32-bit element. */ + __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vint16 operator +(const vint16& a) { return a; } + /* Negation is computed as 0 - a. */ + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + /* In the mixed forms below the int operand is converted with vint16(int) and the vector operator is reused. */ + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } + __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } + + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } + __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } + + /* Multiplication keeps the low 32 bits of each product (mullo). */ + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } + __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } + + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& 
a, int b) { return a & vint16(b); } + __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } + + __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } + __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } + + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } + __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } + + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + + __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } + __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } + + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } + __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } + + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vint16 umax(const vint16& a, 
const vint16& b) { return _mm512_max_epu32(a, b); } + + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } + __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } + + __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } + __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } + + __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } + __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } + + __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } + __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } + + __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } + __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } + + __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } + __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } + __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } + + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } + __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } + + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } + __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } + + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } + __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } + + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } + __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } + + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } + __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } + + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return 
_mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return 
_mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + + + __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline int toScalar(const vint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + + template<int N, int i> + vint<N> extractN(const vint16& v); + + template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template<> __forceinline vint4 
extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + + template<int i> __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + + template<int i> __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_and2(vint16 x) { return x & 
shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } + __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } + __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 conflict(const vint16& index) + { + return _mm512_conflict_epi32(index); + } + + __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) + { + return 
_mm512_mask_conflict_epi32(dest,mask,index); + } + + __forceinline vint16 convert_uint32_t(const __m512& f) { + return _mm512_cvtps_epu32(f); + } + + __forceinline vint16 permute(vint16 v, vint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vint16 reverse(const vint16 &a) { + return permute(a,vint16(reverse_step)); + } + + __forceinline vint16 prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vint16 reverse_prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + /* this should use a vbool8 and a vint8_64...*/ + template<int scale = 1, int hint = _MM_HINT_T0> + __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) + { +#if defined(__AVX512PF__) + _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h new file mode 100644 index 0000000000..9814d5c71c --- /dev/null +++ b/thirdparty/embree/common/simd/vint4_sse2.h @@ -0,0 +1,598 @@ +// Copyright 2009-2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint4& a) { v = a.v; } + __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } + + __forceinline vint(__m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} + + __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} + __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} + __forceinline vint(NegInfTy) : 
v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} + __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} + + __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { + return _mm_mask_compress_epi32(v, mask, v); + } + static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { + return _mm_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return 
_mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + + +#if defined(__SSE4_1__) + static __forceinline vint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } +#else + + static __forceinline vint4 load(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + +#endif + + static __forceinline vint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store(unsigned char* ptr, const vint4& v) { +#if defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(int*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned char)v[i]; +#endif + } + + static __forceinline void store(unsigned short* ptr, const vint4& v) { 
+ for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; + } + + static __forceinline vint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32(ptr, index, scale); +#else + return vint4( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { + vint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_epi32((int*)ptr, index, v, scale); +#else + *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) + { +#if 
defined(__AVX512VL__) + _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + +#if defined(__x86_64__) + static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vint4 operator +(const vint4& a) { return a; } + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } +#if defined(__SSSE3__) + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + 
/// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } + __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } + + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } + __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } + +#if defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#else + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +#endif + __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } + __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } + + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } + __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } + + __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } + __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } + + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } + __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } + + __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, int n) { return 
_mm_srai_epi32(a, n); } + + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } + __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } + __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } + + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } + __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } + __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } +#endif + + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } + __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } + + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } + __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } + + __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } + __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return 
_mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } + __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } + + __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } + __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } + + __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } + __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } + + __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } + __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } + + __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } + __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } + + __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } + __forceinline 
vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } + + __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } + __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } + __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } + __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } + __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } + __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } + __forceinline vboolf4 
le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vint4 select(const vint4& t, const vint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +#if defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } + + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } + __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } + __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } + __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const 
vint4& a, const vint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template<int i> + __forceinline vint4 shuffle(const vint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } + template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } + + __forceinline size_t toSizeT(const vint4& v) { +#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround + return toScalar(v); +#else + return _mm_cvtsi128_si64(v); +#endif + } + +#if defined(__AVX512VL__) + + __forceinline vint4 permute(const vint4 &a, const vint4 &index) { + return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); + } + + template<int i> + __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { + return _mm_alignr_epi32(a, b, i); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + 
//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umin(a0,b0); + const vint4 d0 = umax(a0,b0); + const vint4 a1 = select<0x5 /* 
0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umin(a1,b1); + const vint4 d1 = umax(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umin(a2,b2); + const vint4 d2 = umax(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umax(a0,b0); + const vint4 d0 = umin(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umax(a1,b1); + const vint4 d1 = umin(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umax(a2,b2); + const vint4 d2 = umin(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + +#else + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = min(a0,b0); + const vint4 d0 = max(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = min(a1,b1); + const vint4 d1 = max(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = min(a2,b2); + const vint4 d2 = max(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = max(a0,b0); + const vint4 d0 = min(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = max(a1,b1); + const vint4 d1 = min(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 
= max(a2,b2); + const vint4 d2 = min(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h new file mode 100644 index 0000000000..f43e9a8c22 --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx.h @@ -0,0 +1,470 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit 
vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vint8 loadu(const void* a) { return 
_mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vint8 load(const unsigned char* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned char* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 load(const unsigned short* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned short* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) { + vint4 il(i.vl); + vint4 ih(i.vh); + vint4::store(ptr + 0,il); + vint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static 
__forceinline vint8 gather(const int* ptr, const vint8& index) { + return vint8( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3]), + *(int*)(((char*)ptr)+scale*index[4]), + *(int*)(((char*)ptr)+scale*index[5]), + *(int*)(((char*)ptr)+scale*index[6]), + *(int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { + vint8 r = zero; + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if 
(likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } + __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vint8 operator 
-(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } + + __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), 
_mm_srli_epi32(a.vh, b)); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } + __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } + + __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } + __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return 
a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator > (const 
vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } + + __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return 
_mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + 
/* Index-0 specialization of extract4: the low 128-bit half is obtained with a
   plain cast rather than the generic 128-bit extract used for other indices. */
template<>
__forceinline vint4 extract4<0>(const vint8& a)
{
  return _mm256_castsi256_si128(a);
}

/* Returns element 0 of the vector as a scalar int. */
__forceinline int toScalar(const vint8& v)
{
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(v));
}

////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

/* Every vreduce_* family below follows the same three-step scheme: combine the
   vector with a permuted copy of itself, first swapping neighbours inside each
   pair (..2), then swapping the two pairs inside each 128-bit half (..4), and
   finally swapping the two halves. After the last step all eight elements hold
   the reduction of the whole input. */

/* Minimum of each element and its pair neighbour. */
__forceinline vint8 vreduce_min2(const vint8& v)
{
  const vint8 swappedNeighbours = shuffle<1,0,3,2>(v);
  return min(v, swappedNeighbours);
}

/* Minimum over each group of four elements (one 128-bit half). */
__forceinline vint8 vreduce_min4(const vint8& v)
{
  const vint8 pairMin = vreduce_min2(v);
  const vint8 swappedPairs = shuffle<2,3,0,1>(pairMin);
  return min(pairMin, swappedPairs);
}

/* Minimum over all eight elements, replicated into every element. */
__forceinline vint8 vreduce_min(const vint8& v)
{
  const vint8 halfMin = vreduce_min4(v);
  const vint8 swappedHalves = shuffle4<1,0>(halfMin);
  return min(halfMin, swappedHalves);
}

/* Maximum of each element and its pair neighbour. */
__forceinline vint8 vreduce_max2(const vint8& v)
{
  const vint8 swappedNeighbours = shuffle<1,0,3,2>(v);
  return max(v, swappedNeighbours);
}

/* Maximum over each group of four elements (one 128-bit half). */
__forceinline vint8 vreduce_max4(const vint8& v)
{
  const vint8 pairMax = vreduce_max2(v);
  const vint8 swappedPairs = shuffle<2,3,0,1>(pairMax);
  return max(pairMax, swappedPairs);
}

/* Maximum over all eight elements, replicated into every element. */
__forceinline vint8 vreduce_max(const vint8& v)
{
  const vint8 halfMax = vreduce_max4(v);
  const vint8 swappedHalves = shuffle4<1,0>(halfMax);
  return max(halfMax, swappedHalves);
}

/* Sum of each element and its pair neighbour. */
__forceinline vint8 vreduce_add2(const vint8& v)
{
  const vint8 swappedNeighbours = shuffle<1,0,3,2>(v);
  return v + swappedNeighbours;
}

/* Sum over each group of four elements (one 128-bit half). */
__forceinline vint8 vreduce_add4(const vint8& v)
{
  const vint8 pairSum = vreduce_add2(v);
  const vint8 swappedPairs = shuffle<2,3,0,1>(pairSum);
  return pairSum + swappedPairs;
}

/* Sum over all eight elements, replicated into every element. */
__forceinline vint8 vreduce_add(const vint8& v)
{
  const vint8 halfSum = vreduce_add4(v);
  const vint8 swappedHalves = shuffle4<1,0>(halfSum);
  return halfSum + swappedHalves;
}

/* Scalar results: element 0 of the corresponding replicated reduction. */
__forceinline int reduce_min(const vint8& v)
{
  const vint8 replicated = vreduce_min(v);
  return toScalar(replicated);
}

__forceinline int reduce_max(const vint8& v)
{
  const vint8 replicated = vreduce_max(v);
  return toScalar(replicated);
}

__forceinline int reduce_add(const vint8& v)
{
  const vint8 replicated = vreduce_add(v);
  return toScalar(replicated);
}

/* Position of an element equal to the overall minimum.
   NOTE(review): bsf/movemask are defined elsewhere; presumably this yields the
   lowest-numbered matching element - confirm against those helpers. */
__forceinline size_t select_min(const vint8& v)
{
  const vboolf8 atMinimum = (v == vreduce_min(v));
  return bsf(movemask(atMinimum));
}

/* Position of an element equal to the overall maximum (same caveat as above). */
__forceinline size_t select_max(const vint8& v)
{
  const vboolf8 atMaximum = (v == vreduce_max(v));
  return bsf(movemask(atMaximum));
}

/* Masked variant of select_min: elements outside `valid` are replaced by
   pos_inf before reducing, and the comparison result is masked with `valid`
   again so that only enabled elements can be reported.
   NOTE(review): with an all-false `valid` the argument passed to bsf is an
   empty mask - confirm callers never do that. */
__forceinline size_t select_min(const vboolf8& valid, const vint8& v)
{
  const vint8 candidates = select(valid, v, vint8(pos_inf));
  const vboolf8 winners = valid & (candidates == vreduce_min(candidates));
  return bsf(movemask(winners));
}
__forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + 
const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h new file mode 100644 index 0000000000..e04737ffbe --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx2.h @@ -0,0 +1,518 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 
0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static 
__forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vint8 gather(const int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32(ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { + vint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = 
v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } +#else + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 
operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } + __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { 
return _mm256_slli_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } + + __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } + + __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; 
} + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return 
_mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vint8 select(const vint8& t, const vint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const 
vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const 
vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const 
float*)ptr)); } + + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vint8 permute(const vint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + 
shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select<0x33 /* 
0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong4_avx2.h b/thirdparty/embree/common/simd/vllong4_avx2.h new file mode 100644 index 0000000000..6c86845877 --- /dev/null +++ 
b/thirdparty/embree/common/simd/vllong4_avx2.h @@ -0,0 +1,352 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX2 64-bit long long type */ + template<> + struct vllong<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256i v; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong4& t) { v = t.v; } + __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } + + __forceinline vllong(const __m256i& t) { v = t; } + __forceinline operator __m256i() const { return v; } + __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } + + + __forceinline vllong(long long i) { + v = _mm256_set1_epi64x(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm256_set_epi64x(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} + __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + 
//////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); + } + + static __forceinline vllong4 loadu(const void* addr) + { + return _mm256_loadu_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const vllong4* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const long long* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline void store(void* ptr, const vllong4& v) { + _mm256_store_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong4& v) { + _mm256_storeu_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_storeu_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_store_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { + #if defined(__AVX512VL__) + return _mm256_mask_blend_epi64(m, f, t); + 
#else + return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); + #endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } +#else + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } +#endif + + __forceinline vllong4 operator +(const vllong4& a) { return a; } + __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } + __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } + __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } + + __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } + __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } + __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } + + /* only low 32bit part */ + __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } + __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } + __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } + + __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } + __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); 
} + __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } + + __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } + __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } + __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } + + __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } + __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } + __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } + + __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } + //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } + + __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } + //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } + //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } + + __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } + + //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } + //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } + //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } + + //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } + //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } + //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } + +#if defined(__AVX512VL__) + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& 
a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } +#else + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } + __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } + + __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } + __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } + + __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } + __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } + + __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } + __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } + + __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } + __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } + + __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } + //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 
operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } +#endif + + __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } + __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } + + __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } + __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } + + __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } + __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } + + __forceinline vboold4 operator < (const vllong4& a, long long b) { return a 
< vllong4(b); } + __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } + + __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } + __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } + + __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } + __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } + + __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } + __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } + __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } + __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } + __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } + __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const 
vllong4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong4 shuffle(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template<int i> + __forceinline vllong4 shuffle(const vllong4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vllong4 shuffle2(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); + } + + __forceinline long long toScalar(const vllong4& v) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + +#if defined(__AVX512VL__) + __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { + // workaround for GCC 7.x +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) + return _mm256_permutex2var_epi64(a,index,a); +#else + return _mm256_permutexvar_epi64(index,a); +#endif + } + + __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { + return _mm256_permutex2var_epi64(a,index,b); + } + +#endif + //////////////////////////////////////////////////////////////////////////////// + 
/// Reductions + //////////////////////////////////////////////////////////////////////////////// + + + __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } + __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } + __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } + __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } + __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } + __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h new file mode 100644 index 0000000000..ee69411637 --- /dev/null +++ b/thirdparty/embree/common/simd/vllong8_avx512.h @@ -0,0 +1,358 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat 
vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit long long type */ + template<> + struct vllong<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512i v; + long long i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong8& t) { v = t.v; } + __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } + + __forceinline vllong(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vllong(long long i) { + v = _mm512_set1_epi64(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm512_set4_epi64(d,c,b,a); + } + + __forceinline vllong(long long a0, long long a1, long long a2, long long a3, + long long a4, long long a5, long long a6, long long a7) + { + v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vllong(const vllong<4>& i) { + v = _mm512_broadcast_i64x4(i); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} + __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static 
__forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vllong8 loadu(const void* addr) { + return _mm512_loadu_si512(addr); + } + + static __forceinline vllong8 load(const vllong8* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const long long* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const unsigned char* ptr) { + return _mm512_cvtepu8_epi64(*(__m128i*)ptr); + } + + static __forceinline void store(void* ptr, const vllong8& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong8& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { + _mm512_mask_storeu_epi64(ptr,mask,f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { + _mm512_mask_store_epi64(addr,mask,v2); + } + + static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_compress_epi64(a,mask,b); + } + + static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_expand_epi64(b,mask,a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } + + __forceinline vllong8 operator +(const vllong8& a) { return a; } + __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } + __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } + __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } + + __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } + __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } + __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } + + __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } + __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } + __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } + + __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } + __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } + __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } + + __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } + __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } + __forceinline vllong8 operator |(long long a, const vllong8& b) { return 
vllong8(a) | b; } + + __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } + __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } + __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } + + __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } + + __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } + + __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } + __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } + __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } + + __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } + __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } + __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } + + __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } + __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } + __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } + + __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } + __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } + + __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return 
_mm512_mask_and_epi64(c,m,a,b); } + __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } + __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } + + __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } + __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } + + __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } + __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } + + __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } + __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } + + __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } + __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } + + __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } + __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } + __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } + + __forceinline vboold8 operator !=(const vllong8& 
a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } + __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } + + __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } + __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } + + __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } + __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } + + __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } + __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } + + __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } + __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } + + __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + 
__forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { + return _mm512_mask_or_epi64(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template<int i> + __forceinline vllong8 shuffle(const vllong8& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1> + 
__forceinline vllong8 shuffle4(const vllong8& v) { + return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template<int i> + __forceinline vllong8 shuffle4(const vllong8& v) { + return shuffle4<i, i>(v); + } + + template<int i> + __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { + return _mm512_alignr_epi64(a, b, i); + }; + + __forceinline long long toScalar(const vllong8& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vllong8 
vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } + __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } + __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } + __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } + __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { + return _mm512_permutexvar_epi64(index,v); + } + + __forceinline vllong8 reverse(const vllong8& a) { + return permute(a,vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h new file mode 100644 index 0000000000..c9eb6682ff --- /dev/null +++ b/thirdparty/embree/common/simd/vuint16_avx512.h @@ -0,0 +1,424 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl 
+#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 unsigned integer type */ + template<> + struct vuint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vuint16 UInt; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + unsigned int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint16& t) { v = t.v; } + __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } + + __forceinline vuint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vuint(unsigned int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vuint(const vuint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vuint(const vuint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, + unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, + unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, + unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline explicit vuint(const __m512& f) { + v = _mm512_cvtps_epu32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vuint16 loadu(const void* addr) + { + return _mm512_loadu_si512(addr); + } + + static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vuint16 load(const vuint16* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(const unsigned int* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } + + + static __forceinline void store(void* ptr, const vuint16& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vuint16& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { + _mm512_mask_storeu_epi32(ptr,mask,f); + } + + static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { + _mm512_mask_store_epi32(addr,mask,v2); + } + + static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { + return 
_mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + 
//////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vuint16 operator +(const vuint16& a) { return a; } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } + __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } + + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } + __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } + + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } + __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } + + __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } + __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } + + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& 
a, unsigned int b) { return a | vuint16(b); } + __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } + + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } + __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } + + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } + __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } + + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } + __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } + + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { 
return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } + __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } + + __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } + __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } + + __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } + __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } + + __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } + __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } + + __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } + __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } + + __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } + __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, unsigned int 
b) { return a == vuint16(b); } + __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } + + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } + __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } + + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } + __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } + + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } + __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } + + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } + __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } + + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } + __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } + + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + 
__forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + + + __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i> + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle(const vuint16& v) { + return 
_mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline unsigned int toScalar(const vuint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & 
shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } + __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } + __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 permute(vuint16 v, vuint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vuint16 reverse(const vuint16& a) { + return permute(a,vuint16(reverse_step)); + } + + __forceinline vuint16 prefix_sum(const vuint16& a) + { + const 
vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vuint16 reverse_prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h new file mode 100644 index 0000000000..0601b9ab80 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint4_sse2.h @@ -0,0 +1,426 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vuint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vuint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; unsigned int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint4& a) { v = a.v; } + __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } + + __forceinline vuint(const __m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + + __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} +#endif + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} + __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* 
ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + +#if defined(__SSE4_1__) + static __forceinline vuint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vuint4 loadu(const unsigned char* ptr) { + return 
_mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + +#endif + + static __forceinline vuint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline vuint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vuint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32((const int*)ptr, index, scale); +#else + return vuint4( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { + vuint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned 
int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vuint4 operator +(const vuint4& a) { return a; } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } + __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } + + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } + __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return 
_mm_mullo_epu32(a, b); } +//#else +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +//#endif +// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } +// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } + + __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } + __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } + + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } + __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } + + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } + + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } + + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } + 
__forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } + + __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } + __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } +// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } +//#endif + + __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } + __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } + + __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } + __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } + + __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } + __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator 
==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } + __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } + + __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } + __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } + + //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } + //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } + + //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } + //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } + + //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } + //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } + + //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } + //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } + + __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } + __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } + //__forceinline vboolf4 lt(const 
vuint4& a, const vuint4& b) { return a < b; } + //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } + //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } + //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vuint4 select(const vuint4& t, const vuint4& f) { +#if defined(__SSE4_1__) + 
return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +/*#if defined(__SSE4_1__) + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } + __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } + __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } + __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + 
template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template<int i> + __forceinline vuint4 shuffle(const vuint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if 0 +#if defined(__SSE4_1__) + + __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t 
select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h new file mode 100644 index 0000000000..589cd9d731 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx.h @@ -0,0 +1,386 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 
Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : 
v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vuint8 load(const unsigned char* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned char* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 
load(const unsigned short* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned short* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) { + vuint4 il(i.vl); + vuint4 ih(i.vh); + vuint4::store(ptr + 0,il); + vuint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { + return vuint8( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3]), + *(unsigned int*)(((char*)ptr)+scale*index[4]), + *(unsigned int*)(((char*)ptr)+scale*index[5]), + *(unsigned int*)(((char*)ptr)+scale*index[6]), + *(unsigned int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { + vuint8 r = zero; + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, 
const vint8& ofs, const vuint8& v) + { + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + + 
__forceinline vuint8 operator +(const vuint8& a) {
  return a;
}

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////

// Integer arithmetic is done on the two 128-bit halves (a.vl / a.vh) with
// 128-bit integer intrinsics. Vector/scalar overloads broadcast the scalar first.

__forceinline vuint8 operator +(const vuint8& a, const vuint8& b) {
  return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh));
}
__forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); }
__forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; }

__forceinline vuint8 operator -(const vuint8& a, const vuint8& b) {
  return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh));
}
__forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); }
__forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; }

//__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
//__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); }
//__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; }

// Bitwise operators use the 256-bit float-domain intrinsics with bit casts.
__forceinline vuint8 operator &(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
__forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); }
__forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; }

__forceinline vuint8 operator |(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
__forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); }
__forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; }

__forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
__forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); }
__forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; }

__forceinline vuint8 operator <<(const vuint8& a, unsigned int n) {
  return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n));
}

// Logical right shift of every unsigned lane.
// Fix: the low half previously used _mm_srai_epi32 (arithmetic, sign-filling)
// while the high half used _mm_srli_epi32 (logical), so lanes 0-3 and 4-7
// disagreed for any value with the top bit set. Both halves now shift in
// zeros, the same as srl() below.
__forceinline vuint8 operator >>(const vuint8& a, unsigned int n) {
  return vuint8(_mm_srli_epi32(a.vl, n), _mm_srli_epi32(a.vh, n));
}

// Explicitly named shifts: left, arithmetic right (sign bit replicated), and
// logical right (zero filled).
__forceinline vuint8 sll (const vuint8& a, unsigned int b) {
  return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b));
}
__forceinline vuint8 sra (const vuint8& a, unsigned int b) {
  return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b));
}
__forceinline vuint8 srl (const vuint8& a, unsigned int b) {
  return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b));
}

// Unsigned per-lane minimum / maximum.
__forceinline vuint8 min(const vuint8& a, const vuint8& b) {
  return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh));
}
__forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); }
__forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); }

__forceinline vuint8 max(const vuint8& a, const vuint8& b) {
  return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh));
}
__forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); }
__forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); }

////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////

// Each compound assignment forwards to the matching binary operator above.

__forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
__forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; }

__forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
__forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; }

//__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
//__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; }

__forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
__forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; }

__forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
__forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; }

__forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; }
__forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; }

////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////

// Only equality and inequality are enabled; the ordered comparisons are kept
// below as commented-out code.
__forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) {
  return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
                 _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh)));
}
__forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); }
__forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; }

__forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
__forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); }
__forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; }

//__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
//                                                                                      _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
//__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); }
//__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; }

//__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); }
//__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); }
//__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; }

//__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
//                                                                                      _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
//__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); }
//__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; }

//__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); }
//__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); }
//__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; }

__forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
__forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }

// Masked forms: the comparison result ANDed with `mask`.
__forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
__forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }

// Per-lane blend of `t` and `f` under mask `m`, using _mm256_blendv_ps.
__forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
  return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
}


////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////

__forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
__forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}

// Broadcasts lane `i` inside each 128-bit half.
template<int i>
__forceinline vuint8 shuffle(const vuint8& v) {
  return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
}

// shuffle4 permutes whole 128-bit halves: i0 selects the low result half,
// i1 the high result half.
template<int i0, int i1>
__forceinline vuint8 shuffle4(const vuint8& v) {
  return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
}

template<int i0, int i1>
__forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
  return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
}

// 4-lane shuffle applied within each 128-bit half.
template<int i0, int i1, int i2, int i3>
__forceinline vuint8 shuffle(const vuint8& v) {
  return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
}

template<int i0, int i1, int i2, int i3>
__forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
  return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}

// Specializations that map to dedicated duplicate instructions.
template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) {
  return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v)));
}
template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) {
  return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v)));
}
template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) {
  return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v)))));
}

// Insert / extract a 128-bit half (i = 0 low, i = 1 high).
template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) {
  return _mm256_insertf128_si256(a, b, i);
}
template<int i> __forceinline vuint4 extract4(const vuint8& a) {
  return _mm256_extractf128_si256(a, i);
}
template<> __forceinline vuint4 extract4<0>(const vuint8& a) {
  return _mm256_castsi256_si128(a);
}

// Returns lane 0. NOTE(review): the return type is int although the lanes are
// unsigned (the vuint4 counterpart returns unsigned int); left unchanged here
// to keep the interface stable.
__forceinline int toScalar(const vuint8& v) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(v));
}


////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

//__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
//__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
//__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }

//__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
//__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
//__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }

// Horizontal add in three steps: pairs, then each 128-bit half, then across
// the halves. After vreduce_add every lane holds the total.
__forceinline vuint8 vreduce_add2(const vuint8& v) {
  return v + shuffle<1,0,3,2>(v);
}
__forceinline vuint8 vreduce_add4(const vuint8& v) {
  vuint8 v1 = vreduce_add2(v);
  return v1 + shuffle<2,3,0,1>(v1);
}
__forceinline vuint8 vreduce_add (const vuint8& v) {
  vuint8 v1 = vreduce_add4(v);
  return v1 + shuffle4<1,0>(v1);
}

//__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
//__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
__forceinline int reduce_add(const vuint8& v) {
  return toScalar(vreduce_add(v));
}

//__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
//__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }

//__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
//__forceinline size_t select_max(const
vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h new file mode 100644 index 0000000000..17b994522f --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx2.h @@ -0,0 +1,446 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline 
operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + 
//////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return 
_mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vuint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32((const int*) ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { + vuint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; 
+ *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } +#else + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, 
unsigned int n) { return _mm256_slli_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator 
-=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return 
_mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vuint8 select(const vuint8& t, const vuint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + 
//__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } + //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } + //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } + //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } + //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } + //__forceinline vboolf8 gt(const vboolf8& 
mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } + //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return 
_mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + 
shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp new file mode 100644 index 0000000000..abdd269069 --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" 
+#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <malloc.h> + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + 
SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <sys/mman.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sstream> + +#if defined(__MACOSX__) +#include <mach/vm_statistics.h> +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" 
<< std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h new file mode 100644 index 0000000000..4fa474ec1d --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include <vector> +#include <set> + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! 
allocator that performs aligned allocations */ + template<typename T, size_t alignment> + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template<typename T> + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! 
allocator for IDs */ + template<typename T, size_t max_id> + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i<id; i++) { + IDs.insert(i); + } + nextID = id+1; + return true; + } + } + + void deallocate( T id ) + { + assert(id < nextID); + MAYBE_UNUSED auto done = IDs.insert(id).second; + assert(done); + } + + private: + std::set<T> IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h new file mode 100644 index 0000000000..dd9190c52a --- /dev/null +++ b/thirdparty/embree/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! 
static array with static size */ + template<typename T, size_t N> + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! 
static array with dynamic size */ + template<typename T, size_t N> + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i<N; i++) items[i] = v; + } + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+M; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return M == 0; } + __forceinline size_t size () const { return M; } + __forceinline size_t capacity () const { return N; } + __forceinline size_t max_size () const { return N; } + + void resize(size_t new_size) { + assert(new_size < max_size()); + M = new_size; + } + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& v) + { + assert(M+1 < max_size()); + items[M++] = v; + } + + __forceinline void pop_back() + { + assert(!empty()); + M--; + } + + __forceinline void clear() { + M = 0; + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < M); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& front() const { assert(M > 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! 
dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N) + template<typename Ty, size_t max_stack_bytes> + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; } + __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; } + + __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } +#endif + + private: + Ty arr[max_stack_bytes/sizeof(Ty)]; + Ty* data; + size_t N; + + private: + StackArray (const StackArray& other) DELETED; // do not implement + StackArray& operator= (const StackArray& other) DELETED; // do not implement + + }; + + /*! 
dynamic sized array that is allocated on the stack */ + template<typename Ty, size_t max_stack_elements, size_t max_total_elements> + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i<max_stack_elements; i++) + data[i] = arr[i]; + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } + __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } +#endif + + __forceinline DynamicStackArray (const DynamicStackArray& other) + : data(&arr[0]) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h new file mode 100644 index 0000000000..67af254f36 --- /dev/null +++ b/thirdparty/embree/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 + +#pragma once + +#include <atomic> +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() __memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template <typename T> + struct atomic : public std::atomic<T> + { + atomic () {} + + atomic (const T& a) + : std::atomic<T>(a) {} + + atomic (const atomic<T>& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic<T>& other) { + this->store(other.load()); + return *this; + } + }; + + template<typename T> + __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template<typename T> + __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree/common/sys/barrier.cpp b/thirdparty/embree/common/sys/barrier.cpp new file mode 100644 index 0000000000..0c0e39d92d --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" +#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline 
~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic<size_t> i; + atomic<size_t> enterCount; + atomic<size_t> exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } 
/*! Busy-waiting barrier; every participating thread calls this with its own
 *  index in [0,threadCount).
 *
 *  The barrier alternates between two phases selected by `mode`. Phase 0
 *  uses count0/flag0, phase 1 uses count1/flag1. In each phase thread 0
 *  collects the arrival marks of all other threads and then releases them
 *  through the phase flag, while it prepares the counters and the flag of
 *  the other phase for the next call.
 *
 *  NOTE(review): all shared state is `volatile` and the only fence is
 *  __memory_barrier(); whether that is sufficient on weakly ordered CPUs
 *  cannot be judged from this function alone. */
void LinearBarrierActive::wait (const size_t threadIndex)
{
  if (mode == 0)
  {
    if (threadIndex == 0)
    {
      /* clear the arrival marks of the other phase for its next use */
      for (size_t i=0; i<threadCount; i++)
        count1[i] = 0;

      /* spin until every other thread has marked its arrival */
      for (size_t i=1; i<threadCount; i++)
      {
        while (likely(count0[i] == 0))
          pause_cpu();
      }
      /* switch phase and re-arm the other flag before releasing the waiters */
      mode  = 1;
      flag1 = 0;
      __memory_barrier();
      flag0 = 1;
    }
    else
    {
      /* mark arrival, then spin until thread 0 raises the phase flag */
      count0[threadIndex] = 1;
      {
        while (likely(flag0 == 0))
          pause_cpu();
      }

    }
  }
  else
  {
    if (threadIndex == 0)
    {
      /* clear the arrival marks of the other phase for its next use */
      for (size_t i=0; i<threadCount; i++)
        count0[i] = 0;

      /* spin until every other thread has marked its arrival */
      for (size_t i=1; i<threadCount; i++)
      {
        while (likely(count1[i] == 0))
          pause_cpu();
      }

      /* switch phase and re-arm the other flag before releasing the waiters */
      mode  = 0;
      flag0 = 0;
      __memory_barrier();
      flag1 = 1;
    }
    else
    {
      /* mark arrival, then spin until thread 0 raises the phase flag */
      count1[threadIndex] = 1;
      {
        while (likely(flag1 == 0))
          pause_cpu();
      }
    }
  }
}
thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + for (size_t i=0; i<numThreads; i++) threadResults[i] = 0; + barrier.wait(); + barrier.wait(); + for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i; + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + return numFailed == 0; + } + }; + + barrier_sys_regression_test barrier_sys_regression_test; +} + + diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h new file mode 100644 index 0000000000..37fc036291 --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intrinsics.h" +#include "sysinfo.h" +#include "atomic.h" + +namespace embree +{ + /*! system barrier using operating system */ + class BarrierSys + { + public: + + /*! construction / destruction */ + BarrierSys (size_t N = 0); + ~BarrierSys (); + + private: + /*! class in non-copyable */ + BarrierSys (const BarrierSys& other) DELETED; // do not implement + BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t count); + + /*! lets calling thread wait in barrier */ + void wait(); + + private: + void* opaque; + }; + + /*! 
fast active barrier using atomitc counter */ + struct BarrierActive + { + public: + BarrierActive () + : cntr(0) {} + + void reset() { + cntr.store(0); + } + + void wait (size_t numThreads) + { + cntr++; + while (cntr.load() != numThreads) + pause_cpu(); + } + + private: + std::atomic<size_t> cntr; + }; + + /*! fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic<size_t> cntr0; + std::atomic<size_t> cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! 
thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree/common/sys/condition.cpp b/thirdparty/embree/common/sys/condition.cpp new file mode 100644 index 0000000000..606a1d0b04 --- /dev/null +++ b/thirdparty/embree/common/sys/condition.cpp @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + if (pthread_cond_init(&cond,nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_cond_init failed"); + } + + __forceinline ~ConditionImplementation() { + MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0; + assert(ok); + } + + __forceinline void wait(MutexSys& mutex) { + if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0) + THROW_RUNTIME_ERROR("pthread_cond_wait failed"); + } + + __forceinline void notify_all() { + if (pthread_cond_broadcast(&cond) != 0) + THROW_RUNTIME_ERROR("pthread_cond_broadcast failed"); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ 
/*! Condition variable that is used together with a MutexSys.
 *
 *  The platform specific state lives behind the opaque `cond` pointer, so
 *  this header does not have to include any system headers. */
class ConditionSys
{
public:
  ConditionSys();
  ~ConditionSys();

  /*! waits on the condition; the caller passes the mutex it uses to guard the
   *  shared state (NOTE(review): presumably it must be held on entry - confirm) */
  void wait( class MutexSys& mutex );

  /*! wakes all threads that are waiting on this condition */
  void notify_all();

  /*! keeps waiting until pred() returns true; pred is evaluated before the
   *  first wait and again after every wake-up */
  template<typename Predicate>
    __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
  {
    while (!pred()) wait(mutex);
  }

private:
  ConditionSys (const ConditionSys& other) DELETED; // do not implement
  ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement

protected:
  void* cond; //!< opaque pointer to the platform specific implementation
};
create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! create a valid filename from a string */ + FileName::FileName (const std::string& in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! returns path to home folder */ + FileName FileName::homeFolder() + { +#ifdef __WIN32__ + const char* home = getenv("UserProfile"); +#else + const char* home = getenv("HOME"); +#endif + if (home) return home; + return ""; + } + + /*! returns path to executable */ + FileName FileName::executableFolder() { + return FileName(getExecutableFileName()).path(); + } + + /*! returns the path */ + FileName FileName::path() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return FileName(); + return filename.substr(0,pos); + } + + /*! returns the basename */ + std::string FileName::base() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return filename; + return filename.substr(pos+1); + } + + /*! returns the extension */ + std::string FileName::ext() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return ""; + return filename.substr(pos+1); + } + + /*! returns the extension */ + FileName FileName::dropExt() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return filename; + return filename.substr(0,pos); + } + + /*! 
returns the basename without extension */ + std::string FileName::name() const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) end = filename.size(); + return filename.substr(start, end - start); + } + + /*! replaces the extension */ + FileName FileName::setExt(const std::string& ext) const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) return FileName(filename+ext); + return FileName(filename.substr(0,end)+ext); + } + + /*! adds the extension */ + FileName FileName::addExt(const std::string& ext) const { + return FileName(filename+ext); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const FileName& other ) const { + if (filename == "") return FileName(other); + else return FileName(filename + path_sep + other.filename); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const std::string& other ) const { + return operator+(FileName(other)); + } + + /*! removes the base from a filename (if possible) */ + FileName FileName::operator -( const FileName& base ) const { + size_t pos = filename.find_first_of(base); + if (pos == std::string::npos) return *this; + return FileName(filename.substr(pos+1)); + } + + /*! == operator */ + bool operator== (const FileName& a, const FileName& b) { + return a.filename == b.filename; + } + + /*! != operator */ + bool operator!= (const FileName& a, const FileName& b) { + return a.filename != b.filename; + } + + /*! 
/*! Convenience class for handling file names and paths.
 *
 *  The name is stored as a single string. The constructors taking a string
 *  convert both '\\' and '/' to the platform separator and strip trailing
 *  separators (see filename.cpp). */
class FileName
{
public:

  /*! create an empty filename */
  FileName ();

  /*! create a valid filename from a string */
  FileName (const char* filename);

  /*! create a valid filename from a string */
  FileName (const std::string& filename);

  /*! returns path to home folder (empty if the environment variable is not set) */
  static FileName homeFolder();

  /*! returns path to executable */
  static FileName executableFolder();

  /*! auto convert into a string */
  operator std::string() const { return filename; }

  /*! returns a string of the filename */
  const std::string str() const { return filename; }

  /*! returns a c-string of the filename; the pointer refers to the internal
   *  string of this object */
  const char* c_str() const { return filename.c_str(); }

  /*! returns the path of a filename (everything before the last separator,
   *  empty if there is none) */
  FileName path() const;

  /*! returns the file of a filename (everything after the last separator) */
  std::string base() const;

  /*! returns the base of a filename without extension */
  std::string name() const;

  /*! returns the file extension (without the dot) */
  std::string ext() const;

  /*! drops the file extension */
  FileName dropExt() const;

  /*! replaces the file extension; ext is appended verbatim, so it has to
   *  contain the dot if one is wanted */
  FileName setExt(const std::string& ext = "") const;

  /*! adds file extension; ext is appended verbatim */
  FileName addExt(const std::string& ext = "") const;

  /*! concatenates two filenames to this/other (just other if this is empty) */
  FileName operator +( const FileName& other ) const;

  /*! concatenates two filenames to this/other */
  FileName operator +( const std::string& other ) const;

  /*! removes the base from a filename (if possible) */
  FileName operator -( const FileName& base ) const;

  /*! == operator (plain string comparison) */
  friend bool operator==(const FileName& a, const FileName& b);

  /*! != operator (plain string comparison) */
  friend bool operator!=(const FileName& a, const FileName& b);

  /*! output operator */
  friend std::ostream& operator<<(std::ostream& cout, const FileName& filename);

private:
  std::string filename; //!< the stored name
};
/*! Returns a tick count for timing purposes. Despite its name, the Windows
 *  variant does not read the CPU time-stamp counter: it returns the value
 *  delivered by QueryPerformanceCounter(), cast to size_t. */
__forceinline size_t read_tsc()
{
  LARGE_INTEGER li;
  QueryPerformanceCounter(&li);
  return (size_t)li.QuadPart;
}
/*! Executes CPUID for leaf `op` and stores EAX, EBX, ECX, EDX in out[0..3].
 *
 *  This variant is only compiled for i386 position independent code. EBX is
 *  swapped with the register holding out[1] before and after the CPUID
 *  instruction, so EBX keeps its value and the EBX result still ends up in
 *  out[1].
 *  NOTE(review): presumably needed because EBX is reserved in 32-bit PIC
 *  code - confirm. */
__forceinline void __cpuid(int out[4], int op)
{
  asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                "cpuid\n\t"
                "xchg{l}\t{%%}ebx, %1\n\t"
                : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                : "0"(op));
}

/*! Same as __cpuid, with `op1` passed in EAX and `op2` in ECX. */
__forceinline void __cpuid_count(int out[4], int op1, int op2)
{
  asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                "cpuid\n\t"
                "xchg{l}\t{%%}ebx, %1\n\t"
                : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                : "0" (op1), "2" (op2));
}
/*! Bit scan forward: returns the index of the lowest set bit of v.
 *
 *  One of three implementations is selected at compile time: TZCNT when
 *  AVX2 is enabled, the BSF instruction through inline assembly when x86
 *  assembly is available, and __builtin_ctz otherwise.
 *  NOTE(review): the three paths do not agree for v == 0 (the BSF
 *  instruction and __builtin_ctz leave the result undefined for a zero
 *  input), so callers should only pass non-zero values. */
__forceinline int bsf(int v) {
#if defined(__AVX2__)
  return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
  int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
  return __builtin_ctz(v);
#endif
}
+#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if defined(__AVX2__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (1 << i)); /* portable fallback: toggle bit i */ +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (1 << i)); /* portable fallback: set bit i (mask is 1 << i, not v << i) */ +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(1 << i)); /* portable fallback: clear bit i (mask is 1 << i, not v << i) */ +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (size_t(1) << i)); /* portable fallback: toggle bit i; size_t-wide mask so i >= 31 works */ +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (size_t(1) << i)); /* portable fallback: set bit i; size_t-wide mask so i >= 31 works */ +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(size_t(1) << i)); /* portable fallback: clear bit i; size_t-wide mask so i >= 31 works */ +#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + 
+#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__64BIT__) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + +#if defined(__X86_ASM__) + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } +#endif + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i<N; i++) + _mm_pause(); + } + + /* prefetches */ + __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } + __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } + __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); } + __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); } + __forceinline void prefetchEX (const void* ptr) { +#if defined(__INTEL_COMPILER) + _mm_prefetch((const 
char*)ptr,_MM_HINT_ET0); +#else + _mm_prefetch((const char*)ptr,_MM_HINT_T0); +#endif + } + + __forceinline void prefetchL1EX(const void* ptr) { + prefetchEX(ptr); + } + + __forceinline void prefetchL2EX(const void* ptr) { + prefetchEX(ptr); + } +#if defined(__AVX2__) + __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } + __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } +#if defined(__X86_64__) + __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); } + __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); } +#endif +#endif + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) + __forceinline float mm512_cvtss_f32(__m512 v) { + return _mm512_cvtss_f32(v); + } + __forceinline int mm512_mask2int(__mmask16 k1) { + return _mm512_mask2int(k1); + } + __forceinline __mmask16 mm512_int2mask(int mask) { + return _mm512_int2mask(mask); + } +#else + __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 + return _mm_cvtss_f32(_mm512_castps512_ps128(v)); + } + __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3 + return (int)k1; + } + __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3 + return (__mmask16)mask; + } +#endif +#endif +} diff --git a/thirdparty/embree/common/sys/library.cpp b/thirdparty/embree/common/sys/library.cpp new file mode 100644 index 0000000000..fc983dffd5 --- /dev/null +++ b/thirdparty/embree/common/sys/library.cpp @@ -0,0 +1,83 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "library.h" +#include "sysinfo.h" +#include "filename.h" + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform 
+//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return (void*)GetProcAddress(HMODULE(lib),sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <dlfcn.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree/common/sys/library.h b/thirdparty/embree/common/sys/library.h new file mode 100644 index 0000000000..67e14d2420 --- 
/dev/null +++ b/thirdparty/embree/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp new file mode 100644 index 0000000000..789feaf2d8 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.cpp @@ -0,0 +1,57 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + /*! 
system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h new file mode 100644 index 0000000000..4cb3626d92 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! 
spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic<bool> flag; + }; + + /*! safe mutex lock and unlock helper */ + template<typename Mutex> class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h new file mode 100644 index 0000000000..697e07bb86 --- /dev/null +++ b/thirdparty/embree/common/sys/platform.h @@ -0,0 +1,392 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include <cstddef> +#include <cassert> +#include <cstdlib> +#include <cstdio> +#include <memory> +#include <stdexcept> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <string> +#include <cstring> +#include <stdint.h> +#include <functional> + +//////////////////////////////////////////////////////////////////////////////// +/// detect 
platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 Intel platform */ +#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#define __X86_ASM__ +#elif defined(__i386__) || defined(_M_IX86) +#define __X86_ASM__ +#endif + +/* detect 64 bit platform */ +#if defined(__X86_64__) || defined(__aarch64__) +#define __64BIT__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +// -- GODOT start -- +#if 
defined(__WIN32__) && !defined(__MINGW32__) +// -- GODOT end -- +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +#define __thread __declspec(thread) +#endif +#if !defined(__aligned) +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#if !defined(likely) +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) 
+#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + // Godot replaces the upstream throw with: print "<file> (<line>): <message>" to stderr, then abort. + #define THROW_RUNTIME_ERROR(str) \ + fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__64BIT__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions 
+//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma 
warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored "-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored 
"-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif 
defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template<typename Closure> + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template <typename Closure> + OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { + return OnScopeExitHelper<Closure>(f); + } + +#define 
STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template<typename Ty> + std::unique_ptr<Ty> make_unique(Ty* ptr) { + return std::unique_ptr<Ty>(ptr); + } + +} diff --git a/thirdparty/embree/common/sys/ref.h b/thirdparty/embree/common/sys/ref.h new file mode 100644 index 0000000000..c2b56c1908 --- /dev/null +++ b/thirdparty/embree/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic<size_t> refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template<typename Type> + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + 
input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } + + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template<typename TypeOut> + __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + + template<typename TypeOut> + __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + }; + + template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; } + + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; } + template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; } + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; } + + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; } + template<typename Type> __forceinline bool operator !=(NullTy , const Ref<Type>& b) { 
return nullptr != b.ptr; } + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree/common/sys/regression.cpp b/thirdparty/embree/common/sys/regression.cpp new file mode 100644 index 0000000000..45315b1105 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable initialization + * order. */ + std::vector<RegressionTest*>& get_regression_tests() + { + static std::vector<RegressionTest*> regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree/common/sys/regression.h b/thirdparty/embree/common/sys/regression.h new file mode 100644 index 0000000000..bb0bb94006 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include <vector> + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! 
run all regression tests */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/string.cpp new file mode 100644 index 0000000000..f42fdc8536 --- /dev/null +++ b/thirdparty/embree/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include <algorithm> +#include <ctype.h> + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/string.h new file mode 100644 index 0000000000..820076b21c --- /dev/null +++ b/thirdparty/embree/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + 
+#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp new file mode 100644 index 0000000000..f1a59e511e --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -0,0 +1,656 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include <sys/cpuset.h> +#include <pthread_np.h> +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) && !defined(__64BIT__) + return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__64BIT__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__64BIT__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__64BIT__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__64BIT__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__64BIT__) + return "Cygwin (64bit)"; +#elif 
defined(__WIN32__) && !defined(__64BIT__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__64BIT__) + return "Windows (64bit)"; +#elif defined(__MACOSX__) && !defined(__64BIT__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__64BIT__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && !defined(__64BIT__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__64BIT__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." + toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { +#if defined(__X86_ASM__) + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; +#elif defined(__ARM_NEON) + return "ARM"; +#else + return "Unknown"; +#endif + } + + CPU getCPUModel() + { +#if defined(__X86_ASM__) + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = 
(out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if 
(DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + +#elif defined(__ARM_NEON) + return CPU::ARM; +#endif + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case 
CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "ARM"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if defined(__X86_ASM__) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF 
(prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if defined(__X86_ASM__) + __noinline int64_t get_xcr0() + { +// -- GODOT start -- +#if defined (__WIN32__) && !defined (__MINGW32__) +// -- GODOT end -- + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__X86_ASM__) + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) 
__cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & 
CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#elif defined(__ARM_NEON) + /* emulated features with sse2neon */ + return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED; +#else + /* Unknown CPU. 
*/ + return 0; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512) return "AVX512"; + return "UNKNOWN"; + } + 
+ bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512)) v += "AVX512 "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <psapi.h> + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + 
GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include <stdio.h> +#include <unistd.h> + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include <sys/sysctl.h> + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach-o/dyld.h> + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +// -- GODOT start -- +// #if defined(__MACOSX__) +#if defined(__MACOSX__) || defined(__ANDROID__) +// -- GODOT end -- + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), 
&set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h new file mode 100644 index 0000000000..72351d12e4 --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.h @@ -0,0 +1,178 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512 +# define ISA AVX512 +# define ISA_STR "AVX512" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif 
defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! 
CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! 
ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! 
returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp new file mode 100644 index 0000000000..f4014be89b --- /dev/null +++ b/thirdparty/embree/common/sys/thread.cpp @@ -0,0 +1,474 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include <iostream> +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <xmmintrin.h> +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if 
(pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i<groups; i++) { + int processors = pGetActiveProcessorCount(i); + if (totalProcessors + processors > affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! 
creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { + TerminateThread(HANDLE(tid),0); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! 
destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + +#include <fstream> +#include <sstream> +#include <algorithm> + +namespace embree +{ + static MutexSys mutex; + static std::vector<size_t> threadIDs; + + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock<MutexSys> lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i<threadIDs.size();i++) + std::cout << i << " -> " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i<threadIDs.size();i++) { + for (size_t j=0;j<threadIDs.size();j++) { + if (i != j && threadIDs[i] == threadIDs[j]) { + threadIDs.clear(); + } + } + } + } + + /* re-map threadIDs if mapping is available */ + size_t ID = threadID; + if (threadID < threadIDs.size()) + ID = threadIDs[threadID]; + + /* find correct thread to affinitize to */ + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + { + for (int i=0, j=0; i<CPU_SETSIZE; i++) + { + if (!CPU_ISSET(i,&set)) continue; + + if (j == 
ID) { + ID = i; + break; + } + j++; + } + } + + return ID; + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + size_t threadID = mapThreadID(affinity); + CPU_SET(threadID, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +// -- GODOT start -- +//////////////////////////////////////////////////////////////////////////////// +/// Android Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__ANDROID__) + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + sched_setaffinity(0, sizeof(cset), &cset); + } +} +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__FreeBSD__) + +#include <pthread_np.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach/thread_act.h> +#include <mach/thread_policy.h> +#include <mach/mach_init.h> + +namespace embree +{ + /*! 
set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { +#if !defined(__ARM_NEON) // affinity seems not supported on M1 chip + + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + +#endif + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include <pthread.h> +#include <sched.h> + +#if defined(__USE_NUMA__) +#include <numa.h> +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + return nullptr; + } + + /*! 
creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +// -- GODOT start -- +#elif defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); + } +#endif +// -- GODOT end -- + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { +// -- GODOT start -- +#if defined(__ANDROID__) + FATAL("Can't destroy threads on Android."); +#else + pthread_cancel(*(pthread_t*)tid); + delete (pthread_t*)tid; +#endif +// -- GODOT end -- + } + + /*! 
creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h new file mode 100644 index 0000000000..92a10d5c5d --- /dev/null +++ b/thirdparty/embree/common/sys/thread.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include <vector> + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! destroy handle of a thread */ + void destroyThread(thread_t tid); + + /*! 
type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h new file mode 100644 index 0000000000..f832626789 --- /dev/null +++ b/thirdparty/embree/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include <algorithm> + +namespace embree +{ + template<typename T, typename allocator> + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template<typename M> + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i<size_active; i++) + ::new (&items[i]) value_type(other.items[i]); + } + + __forceinline vector_t (vector_t&& other) + : alloc(std::move(other.alloc)) + { + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + } + + __forceinline vector_t& operator=(const vector_t& other) + { + resize(other.size_active); + for (size_t i=0; 
i<size_active; i++) + items[i] = value_type(other.items[i]); + return *this; + } + + __forceinline vector_t& operator=(vector_t&& other) + { + clear(); + alloc = std::move(other.alloc); + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + return *this; + } + + /********************** Iterators ****************************/ + + __forceinline iterator begin() { return items; }; + __forceinline const_iterator begin() const { return items; }; + + __forceinline iterator end () { return items+size_active; }; + __forceinline const_iterator end () const { return items+size_active; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return size_active == 0; } + __forceinline size_t size () const { return size_active; } + __forceinline size_t capacity () const { return size_alloced; } + + + __forceinline void resize(size_t new_size) { + internal_resize(new_size,internal_grow_size(new_size)); + } + + __forceinline void reserve(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return; + + /* resize exact otherwise */ + internal_resize(size_active,new_alloced); + } + + __forceinline void shrink_to_fit() { + internal_resize(size_active,size_active); + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& front() const { assert(size_active > 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + 
__forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i<size_active; i++) + alloc.destroy(&items[i]); + + /* free memory */ + alloc.deallocate(items,size_alloced); + items = nullptr; + size_active = size_alloced = 0; + } + + /******************** Comparisons **************************/ + + friend bool operator== (const vector_t& a, const vector_t& b) + { + if (a.size() != b.size()) return false; + for (size_t i=0; i<a.size(); i++) + if (a[i] != b[i]) + return false; + return true; + } + + friend bool operator!= (const vector_t& a, const vector_t& b) { + return !(a==b); + } + + private: + + __forceinline void internal_resize_init(size_t new_active) + { + assert(size_active == 0); + assert(size_alloced == 0); + assert(items == nullptr); + if (new_active == 0) return; + items = alloc.allocate(new_active); + for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); + size_active = new_active; + size_alloced = new_active; + } + + __forceinline void internal_resize(size_t new_active, size_t new_alloced) + { + assert(new_active <= new_alloced); + + /* destroy elements */ + if (new_active < size_active) + { + for (size_t i=new_active; i<size_active; i++) + alloc.destroy(&items[i]); + size_active = new_active; + } + + /* only reallocate if necessary */ + if (new_alloced == size_alloced) { + for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; + size_active = new_active; + return; + } + + /* reallocate and copy items */ + 
T* old_items = items; + items = alloc.allocate(new_alloced); + for (size_t i=0; i<size_active; i++) { + ::new (&items[i]) T(std::move(old_items[i])); + alloc.destroy(&old_items[i]); + } + + for (size_t i=size_active; i<new_active; i++) { + ::new (&items[i]) T; + } + + alloc.deallocate(old_items,size_alloced); + size_active = new_active; + size_alloced = new_alloced; + } + + __forceinline size_t internal_grow_size(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return size_alloced; + + /* resize to next power of 2 otherwise */ + size_t new_size_alloced = size_alloced; + while (new_size_alloced < new_alloced) { + new_size_alloced = std::max(size_t(1),2*new_size_alloced); + } + return new_size_alloced; + } + + private: + allocator alloc; + size_t size_active; // number of valid items + size_t size_alloced; // number of items allocated + T* items; // data array + }; + + /*! vector class that performs standard allocations */ + template<typename T> + using vector = vector_t<T,std::allocator<T>>; + + /*! vector class that performs aligned allocations */ + template<typename T> + using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; + + /*! 
  /*! Work-stealing loop of a thread.
   *
   *  Repeats until pred() returns false: tries to steal a task from
   *  another thread and, whenever a steal succeeded, invokes body() and
   *  restarts both round counters. The inner counter advances by
   *  thread.threadCount() per attempt and yield() is called after each
   *  inner round. */
  template<typename Predicate, typename Body>
  __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body)
  {
    while (true)
    {
      /*! some rounds that yield */
      for (size_t i=0; i<32; i++)
      {
        /*! some spinning rounds */
        const size_t threadCount = thread.threadCount();
        for (size_t j=0; j<1024; j+=threadCount)
        {
          if (!pred()) return;
          if (thread.scheduler->steal_from_other_threads(thread)) {
            i=j=0; // successful steal: start counting the rounds from scratch
            body();
          }
        }
        yield();
      }
    }
  }

  /*! run this task
   *
   *  Executes the closure unless the task has already left the
   *  INITIALIZED state, then keeps stealing and executing local tasks
   *  until the dependency counter dropped to zero, and finally decrements
   *  the dependency counter of the parent task (if any). */
  void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
  {
    /* try to run if not already stolen */
    if (try_switch_state(INITIALIZED,DONE))
    {
      Task* prevTask = thread.task;
      thread.task = this;
      // -- GODOT start --
      // try {
      // if (thread.scheduler->cancellingException == nullptr)
      closure->execute();
      // } catch (...) {
      // if (thread.scheduler->cancellingException == nullptr)
      // thread.scheduler->cancellingException = std::current_exception();
      // }
      // -- GODOT end --
      thread.task = prevTask;
      add_dependencies(-1);
    }

    /* steal until all dependencies have completed */
    steal_loop(thread,
               [&] () { return dependencies>0; },
               [&] () { while (thread.tasks.execute_local_internal(thread,this)); });

    /* now signal our parent task that we are finished */
    if (parent)
      parent->add_dependencies(-1);
  }
  /*! run this task (exported wrapper around run_internal) */
  dll_export void TaskScheduler::Task::run (Thread& thread) {
    run_internal(thread);
  }

  /*! Executes the task at the right end of this queue.
   *
   *  \param parent task at which execution stops (may be nullptr)
   *  \return true while tasks remain in the queue after the pop */
  bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent)
  {
    /* stop if we run out of local tasks or reach the waiting task */
    if (right == 0 || &tasks[right-1] == parent)
      return false;

    /* execute task */
    size_t oldRight = right;
    tasks[right-1].run_internal(thread);
    if (right != oldRight) {
      THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
    }

    /* pop task and closure from stack */
    right--;
    if (tasks[right].stackPtr != size_t(-1))
      stackPtr = tasks[right].stackPtr;

    /* also move left pointer */
    if (left >= right) left.store(right.load());

    return right != 0;
  }

  /*! exported wrapper around execute_local_internal */
  dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) {
    return execute_local_internal(thread,parent);
  }

  /*! Tries to move the task at the left end of this queue into the queue
   *  of \p thread.
   *
   *  \return true if a task was placed into the queue of \p thread */
  bool TaskScheduler::TaskQueue::steal(Thread& thread)
  {
    size_t l = left;
    size_t r = right;
    if (l < r)
    {
      l = left++; // claim the slot; re-check against the snapshot of right below
      if (l >= r)
        return false;
    }
    else
      return false;

    if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right]))
      return false;

    thread.tasks.right++;
    return true;
  }

  /* we steal from the left */
  size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
  {
    if (left >= right) return 0;
    return tasks[left].N;
  }

  /*! Thread entry used by ThreadPool::setNumThreads: unpacks the
   *  (pool, thread index) pair, frees it and enters the pool loop. */
  void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
  {
    TaskScheduler::ThreadPool* pool = pair->first;
    size_t threadIndex = pair->second;
    delete pair;
    pool->thread_loop(threadIndex);
  }

  TaskScheduler::ThreadPool::ThreadPool(bool set_affinity)
    : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}

  /*! starts the configured number of threads unless the pool is already running */
  dll_export void TaskScheduler::ThreadPool::startThreads()
  {
    if (running) return;
    setNumThreads(numThreads,true);
  }

  /*! Sets the number of threads of the pool.
   *
   *  The value is clamped to getNumberOfLogicalThreads(). When the pool
   *  is not running and \p startThreads is false only the count is
   *  stored. Otherwise threads are created or joined so that the number
   *  of running threads matches; index 0 never gets a thread of its own. */
  void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads)
  {
    Lock<MutexSys> lock(g_mutex);
    assert(newNumThreads);
    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());

    numThreads = newNumThreads;
    if (!startThreads && !running) return;
    running = true;
    size_t numThreadsActive = numThreadsRunning;

    mutex.lock();
    numThreadsRunning = newNumThreads;
    mutex.unlock();
    condition.notify_all();

    /* start new threads */
    for (size_t t=numThreadsActive; t<numThreads; t++)
    {
      if (t == 0) continue;
      auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); // freed by threadPoolFunction
      threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
    }

    /* stop some threads if we reduce the number of threads */
    for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
      if (t == 0) continue;
      embree::join(threads.back());
      threads.pop_back();
    }
  }

  TaskScheduler::ThreadPool::~ThreadPool()
  {
    /* leave all taskschedulers */
    mutex.lock();
    numThreadsRunning = 0;
    mutex.unlock();
    condition.notify_all();

    /* wait for threads to terminate */
    for (size_t i=0; i<threads.size(); i++)
      embree::join(threads[i]);
  }

  /*! appends a scheduler to the list served by the pool and wakes the pool threads */
  dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler)
  {
    mutex.lock();
    schedulers.push_back(scheduler);
    mutex.unlock();
    condition.notify_all();
  }

  /*! removes the first list entry that equals \p scheduler, if any */
  dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler)
  {
    Lock<MutexSys> lock(mutex);
    for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) {
      if (scheduler == *it) {
        schedulers.erase(it);
        return;
      }
    }
  }

  /*! Loop of a pool thread: waits until a scheduler is registered, joins
   *  the front scheduler, and repeats. The loop ends once
   *  \p globalThreadIndex is no longer below numThreadsRunning. */
  void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex)
  {
    while (globalThreadIndex < numThreadsRunning)
    {
      Ref<TaskScheduler> scheduler = NULL;
      ssize_t threadIndex = -1;
      {
        Lock<MutexSys> lock(mutex);
        condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); });
        if (globalThreadIndex >= numThreadsRunning) break;
        scheduler = schedulers.front();
        threadIndex = scheduler->allocThreadIndex();
      }
      scheduler->thread_loop(threadIndex);
    }
  }

  TaskScheduler::TaskScheduler()
    : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
  {
    threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
    for (size_t i=0; i<threadLocal.size(); i++)
      threadLocal[i].store(nullptr);
  }

  TaskScheduler::~TaskScheduler()
  {
    assert(threadCounter == 0);
  }

  /*! index of the calling thread inside its scheduler, 0 if the thread has no Thread structure */
  dll_export size_t TaskScheduler::threadID()
  {
    Thread* thread = TaskScheduler::thread();
    if (thread) return thread->threadIndex;
    else        return 0;
  }

  /*! same result as threadID() */
  dll_export size_t TaskScheduler::threadIndex()
  {
    Thread* thread = TaskScheduler::thread();
    if (thread) return thread->threadIndex;
    else        return 0;
  }

  dll_export size_t TaskScheduler::threadCount() {
    return threadPool->size();
  }

  /*! Returns the scheduler stored in the thread-local g_instance, creating
   *  it on first use. New instances are also recorded in g_instance_vector
   *  under g_mutex. */
  dll_export TaskScheduler* TaskScheduler::instance()
  {
    if (g_instance == NULL) {
      Lock<MutexSys> lock(g_mutex);
      g_instance = new TaskScheduler;
      g_instance_vector.push_back(g_instance);
    }
    return g_instance;
  }

  /*! creates the global thread pool if needed and sets its thread count */
  void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
  {
    if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity);
    threadPool->setNumThreads(numThreads,start_threads);
  }

  /*! deletes the global thread pool */
  void TaskScheduler::destroy() {
    delete threadPool; threadPool = nullptr;
  }

  /*! hands out the next free thread index of this scheduler */
  dll_export ssize_t TaskScheduler::allocThreadIndex()
  {
    size_t threadIndex = threadCounter++;
    assert(threadIndex < threadLocal.size());
    return threadIndex;
  }

  /*! Lets the calling thread take part in this scheduler: allocates a
   *  thread index, waits until hasRootTask is set and then runs
   *  thread_loop. */
  void TaskScheduler::join()
  {
    mutex.lock();
    size_t threadIndex = allocThreadIndex();
    condition.wait(mutex, [&] () { return hasRootTask.load(); });
    mutex.unlock();
    // -- GODOT start --
    // std::exception_ptr except = thread_loop(threadIndex);
    // if (except != nullptr) std::rethrow_exception(except);
    thread_loop(threadIndex);
    // -- GODOT end --
  }

  void TaskScheduler::reset() {
    hasRootTask = false;
  }

  /*! spins until threadCounter reached threadCount-1 */
  void TaskScheduler::wait_for_threads(size_t threadCount)
  {
    while (threadCounter < threadCount-1)
      pause_cpu();
  }

  /*! Thread structure of the calling thread, nullptr if none is installed */
  dll_export TaskScheduler::Thread* TaskScheduler::thread() {
    return thread_local_thread;
  }

  /*! installs \p thread as Thread structure of the calling thread and returns the previous one */
  dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread)
  {
    Thread* old = thread_local_thread;
    thread_local_thread = thread;
    return old;
  }

  /*! Executes local tasks of the calling thread until its current task is
   *  reached or the queue is empty.
   *
   *  \return true if the calling thread has no Thread structure, otherwise
   *          whether the scheduler's cancellingException is null */
  dll_export bool TaskScheduler::wait()
  {
    Thread* thread = TaskScheduler::thread();
    if (thread == nullptr) return true;
    while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
    return thread->scheduler->cancellingException == nullptr;
  }

  /*! Body of a thread that participates in this scheduler under index
   *  \p threadIndex: registers a Thread structure, steals and executes
   *  tasks while anyTasksRunning is non-zero, unregisters, and finally
   *  waits until threadCounter dropped to zero. */
// -- GODOT start --
// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
  void TaskScheduler::thread_loop(size_t threadIndex)
// -- GODOT end --
  {
    /* allocate thread structure */
    std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
    Thread& thread = *mthread;
    threadLocal[threadIndex].store(&thread);
    Thread* oldThread = swapThread(&thread);

    /* main thread loop */
    while (anyTasksRunning)
    {
      steal_loop(thread,
                 [&] () { return anyTasksRunning > 0; },
                 [&] () {
                   anyTasksRunning++;
                   while (thread.tasks.execute_local_internal(thread,nullptr));
                   anyTasksRunning--;
                 });
    }
    threadLocal[threadIndex].store(nullptr);
    swapThread(oldThread);

    /* remember exception to throw */
    // -- GODOT start --
    // std::exception_ptr except = nullptr;
    // if (cancellingException != nullptr) except = cancellingException;
    // -- GODOT end --
    /* wait for all threads to terminate */
    threadCounter--;
#if defined(__WIN32__)
    size_t loopIndex = 1;
#endif
#define LOOP_YIELD_THRESHOLD (4096)
    while (threadCounter > 0) {
#if defined(__WIN32__)
      /* on Windows mostly spin with _mm_pause and only yield every
       * LOOP_YIELD_THRESHOLD-th iteration */
      if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
        yield();
      else
        _mm_pause();
      loopIndex++;
#else
      yield();
#endif
    }
    // -- GODOT start --
    // return except;
    return;
    // -- GODOT end --
  }

  /*! Visits the other registered threads in round-robin order, starting
   *  after the index of \p thread, and tries to steal one task.
   *
   *  \return true as soon as one steal succeeded */
  bool TaskScheduler::steal_from_other_threads(Thread& thread)
  {
    const size_t threadIndex = thread.threadIndex;
    const size_t threadCount = this->threadCounter;

    for (size_t i=1; i<threadCount; i++)
    {
      pause_cpu(32);
      size_t otherThreadIndex = threadIndex+i;
      if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; // wrap around

      Thread* othread = threadLocal[otherThreadIndex].load();
      if (!othread)
        continue;

      if (othread->tasks.steal(thread))
        return true;
    }

    return false;
  }

  /*! forwards to the global thread pool */
  dll_export void TaskScheduler::startThreads() {
    threadPool->startThreads();
  }

  /*! forwards to the global thread pool */
  dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) {
    threadPool->add(scheduler);
  }

  /*! forwards to the global thread pool */
  dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) {
    threadPool->remove(scheduler);
  }
*/ + RTC_NAMESPACE_BEGIN + + struct TaskScheduler : public RefCount + { + ALIGNED_STRUCT_(64); + friend class Device; + + static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack + static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures + + struct Thread; + + /*! virtual interface for all tasks */ + struct TaskFunction { + virtual void execute() = 0; + }; + + /*! builds a task interface from a closure */ + template<typename Closure> + struct ClosureTaskFunction : public TaskFunction + { + Closure closure; + __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} + void execute() { closure(); }; + }; + + struct __aligned(64) Task + { + /*! states a task can be in */ + enum { DONE, INITIALIZED }; + + /*! switch from one state to another */ + __forceinline void switch_state(int from, int to) + { + __memory_barrier(); + MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); + assert(success); + } + + /*! try to switch from one state to another */ + __forceinline bool try_switch_state(int from, int to) { + __memory_barrier(); + return state.compare_exchange_strong(from,to); + } + + /*! increment/decrement dependency counter */ + void add_dependencies(int n) { + dependencies+=n; + } + + /*! initialize all tasks to DONE state by default */ + __forceinline Task() + : state(DONE) {} + + /*! construction of new task */ + __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) + : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) + { + if (parent) parent->add_dependencies(+1); + switch_state(DONE,INITIALIZED); + } + + /*! construction of stolen task, stealing thread will decrement initial dependency */ + __forceinline Task (TaskFunction* closure, Task* parent) + : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) + { + switch_state(DONE,INITIALIZED); + } + + /*! 
try to steal this task */ + bool try_steal(Task& child) + { + if (!stealable) return false; + if (!try_switch_state(INITIALIZED,DONE)) return false; + new (&child) Task(closure, this); + return true; + } + + /*! run this task */ + dll_export void run(Thread& thread); + + void run_internal(Thread& thread); + + public: + std::atomic<int> state; //!< state this task is in + std::atomic<int> dependencies; //!< dependencies to wait for + std::atomic<bool> stealable; //!< true if task can be stolen + TaskFunction* closure; //!< the closure to execute + Task* parent; //!< parent task to signal when we are finished + size_t stackPtr; //!< stack location where closure is stored + size_t N; //!< approximative size of task + }; + + struct TaskQueue + { + TaskQueue () + : left(0), right(0), stackPtr(0) {} + + __forceinline void* alloc(size_t bytes, size_t align = 64) + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } + + template<typename Closure> + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; + TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); + new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); + right++; + + /* also move left pointer */ + if (left >= right-1) left = right-1; + } + + dll_export bool execute_local(Thread& thread, Task* parent); + bool execute_local_internal(Thread& thread, Task* parent); + bool steal(Thread& thread); + size_t getTaskSizeAtLeft(); + + bool empty() { return right == 0; } + + public: + + /* 
task stack */ + Task tasks[TASK_STACK_SIZE]; + __aligned(64) std::atomic<size_t> left; //!< threads steal from left + __aligned(64) std::atomic<size_t> right; //!< new tasks are added to the right + + /* closure stack */ + __aligned(64) char stack[CLOSURE_STACK_SIZE]; + size_t stackPtr; + }; + + /*! thread local structure for each thread */ + struct Thread + { + ALIGNED_STRUCT_(64); + + Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler) + : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} + + __forceinline size_t threadCount() { + return scheduler->threadCounter; + } + + size_t threadIndex; //!< ID of this thread + TaskQueue tasks; //!< local task queue + Task* task; //!< current active task + Ref<TaskScheduler> scheduler; //!< pointer to task scheduler + }; + + /*! pool of worker threads */ + struct ThreadPool + { + ThreadPool (bool set_affinity); + ~ThreadPool (); + + /*! starts the threads */ + dll_export void startThreads(); + + /*! sets number of threads to use */ + void setNumThreads(size_t numThreads, bool startThreads = false); + + /*! adds a task scheduler object for scheduling */ + dll_export void add(const Ref<TaskScheduler>& scheduler); + + /*! remove the task scheduler object again */ + dll_export void remove(const Ref<TaskScheduler>& scheduler); + + /*! returns number of threads of the thread pool */ + size_t size() const { return numThreads; } + + /*! main loop for all threads */ + void thread_loop(size_t threadIndex); + + private: + std::atomic<size_t> numThreads; + std::atomic<size_t> numThreadsRunning; + bool set_affinity; + std::atomic<bool> running; + std::vector<thread_t> threads; + + private: + MutexSys mutex; + ConditionSys condition; + std::list<Ref<TaskScheduler> > schedulers; + }; + + TaskScheduler (); + ~TaskScheduler (); + + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! 
destroys the task scheduler again */ + static void destroy(); + + /*! lets new worker threads join the tasking system */ + void join(); + void reset(); + + /*! let a worker thread allocate a thread index */ + dll_export ssize_t allocThreadIndex(); + + /*! wait for some number of threads available (threadCount includes main thread) */ + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); + + template<typename Predicate, typename Body> + static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) + { + if (useThreadPool) startThreads(); + + size_t threadIndex = allocThreadIndex(); + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + assert(threadLocal[threadIndex].load() == nullptr); + threadLocal[threadIndex] = &thread; + Thread* oldThread = swapThread(&thread); + thread.tasks.push_right(thread,size,closure); + { + Lock<MutexSys> lock(mutex); + anyTasksRunning++; + hasRootTask = true; + condition.notify_all(); + } + + if (useThreadPool) addScheduler(this); + + while (thread.tasks.execute_local(thread,nullptr)); + anyTasksRunning--; + if (useThreadPool) removeScheduler(this); + + threadLocal[threadIndex] = nullptr; + swapThread(oldThread); + + /* remember exception to throw */ + std::exception_ptr except = nullptr; + if (cancellingException != nullptr) except = cancellingException; + + /* wait for all threads to terminate */ + threadCounter--; + while (threadCounter > 0) yield(); + cancellingException = nullptr; + + /* 
re-throw proper exception */ + if (except != nullptr) + std::rethrow_exception(except); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static __forceinline void spawn(size_t size, const Closure& closure) + { + Thread* thread = TaskScheduler::thread(); + if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); + else instance()->spawn_root(closure,size); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static __forceinline void spawn(const Closure& closure) { + spawn(1,closure); + } + + /* spawn a new task set */ + template<typename Index, typename Closure> + static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) + { + spawn(end-begin, [=]() + { + if (end-begin <= blockSize) { + return closure(range<Index>(begin,end)); + } + const Index center = (begin+end)/2; + spawn(begin,center,blockSize,closure); + spawn(center,end ,blockSize,closure); + wait(); + }); + } + + /* work on spawned subtasks and wait until all have finished */ + dll_export static bool wait(); + + /* returns the ID of the current thread */ + dll_export static size_t threadID(); + + /* returns the index (0..threadCount-1) of the current thread */ + dll_export static size_t threadIndex(); + + /* returns the total number of threads */ + dll_export static size_t threadCount(); + + private: + + /* returns the thread local task list of this worker thread */ + dll_export static Thread* thread(); + + /* sets the thread local task list of this worker thread */ + dll_export static Thread* swapThread(Thread* thread); + + /*! returns the taskscheduler object to be used by the master thread */ + dll_export static TaskScheduler* instance(); + + /*! starts the threads */ + dll_export static void startThreads(); + + /*! adds a task scheduler object for scheduling */ + dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler); + + /*! 
remove the task scheduler object again */ + dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler); + + private: + std::vector<atomic<Thread*>> threadLocal; + std::atomic<size_t> threadCounter; + std::atomic<size_t> anyTasksRunning; + std::atomic<bool> hasRootTask; + std::exception_ptr cancellingException; + MutexSys mutex; + ConditionSys condition; + + private: + static size_t g_numThreads; + static __thread TaskScheduler* g_instance; + static __thread Thread* thread_local_thread; + static ThreadPool* threadPool; + }; + + RTC_NAMESPACE_END + +#if defined(RTC_NAMESPACE) + using RTC_NAMESPACE::TaskScheduler; +#endif +} diff --git a/thirdparty/embree/common/tasking/taskschedulerppl.h b/thirdparty/embree/common/tasking/taskschedulerppl.h new file mode 100644 index 0000000000..cbc2ecdbb8 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerppl.h @@ -0,0 +1,46 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if !defined(__WIN32__) +#error PPL tasking system only available under windows +#endif + +#include <ppl.h> + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() { + return GetCurrentThreadId(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + /* FIXME: threadIndex is NOT supported by PPL! 
*/ + static __forceinline size_t threadIndex() { + return 0; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { + return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; + } + }; +}; diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h new file mode 100644 index 0000000000..35bd49849f --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulertbb.h @@ -0,0 +1,73 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if defined(__WIN32__) +// -- GODOT start -- +#if !defined(NOMINMAX) +// -- GODOT end -- +# define NOMINMAX +// -- GODOT start -- +#endif +// -- GODOT end -- +#endif + +// We need to define these to avoid implicit linkage against +// tbb_debug.lib under Windows. When removing these lines debug build +// under Windows fails. +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 +#include "tbb/tbb.h" +#include "tbb/parallel_sort.h" + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! 
destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::current_thread_index(); +#elif TBB_INTERFACE_VERSION >= 9000 + return tbb::task_arena::current_thread_index(); +#else + return 0; +#endif + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::max_concurrency(); +#else + return tbb::task_scheduler_init::default_num_threads(); +#endif + } + + }; + +}; |