diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3Common')
27 files changed, 4767 insertions, 4898 deletions
diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp index b98e2b4d33..d546d5e066 100644 --- a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp +++ b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp @@ -15,9 +15,11 @@ subject to the following restrictions: #include "b3AlignedAllocator.h" +#ifdef B3_ALLOCATOR_STATISTICS int b3g_numAlignedAllocs = 0; int b3g_numAlignedFree = 0; -int b3g_totalBytesAlignedAllocs = 0;//detect memory leaks +int b3g_totalBytesAlignedAllocs = 0; //detect memory leaks +#endif static void *b3AllocDefault(size_t size) { @@ -29,12 +31,10 @@ static void b3FreeDefault(void *ptr) free(ptr); } -static b3AllocFunc* b3s_allocFunc = b3AllocDefault; -static b3FreeFunc* b3s_freeFunc = b3FreeDefault; - - +static b3AllocFunc *b3s_allocFunc = b3AllocDefault; +static b3FreeFunc *b3s_freeFunc = b3FreeDefault; -#if defined (B3_HAS_ALIGNED_ALLOCATOR) +#if defined(B3_HAS_ALIGNED_ALLOCATOR) #include <malloc.h> static void *b3AlignedAllocDefault(size_t size, int alignment) { @@ -59,123 +59,128 @@ static inline void b3AlignedFreeDefault(void *ptr) } #else - - - - static inline void *b3AlignedAllocDefault(size_t size, int alignment) { - void *ret; - char *real; - real = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment-1)); - if (real) { - ret = b3AlignPointer(real + sizeof(void *),alignment); - *((void **)(ret)-1) = (void *)(real); - } else { - ret = (void *)(real); - } - return (ret); + void *ret; + char *real; + real = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment - 1)); + if (real) + { + ret = b3AlignPointer(real + sizeof(void *), alignment); + *((void **)(ret)-1) = (void *)(real); + } + else + { + ret = (void *)(real); + } + return (ret); } static inline void b3AlignedFreeDefault(void *ptr) { - void* real; + void *real; - if (ptr) { - real = *((void **)(ptr)-1); - b3s_freeFunc(real); - } + if (ptr) + { + real = *((void **)(ptr)-1); + b3s_freeFunc(real); + } } #endif - -static b3AlignedAllocFunc* b3s_alignedAllocFunc = b3AlignedAllocDefault; -static b3AlignedFreeFunc* b3s_alignedFreeFunc = b3AlignedFreeDefault; +static b3AlignedAllocFunc *b3s_alignedAllocFunc = b3AlignedAllocDefault; +static b3AlignedFreeFunc *b3s_alignedFreeFunc = b3AlignedFreeDefault; void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc) { - b3s_alignedAllocFunc = allocFunc ? allocFunc : b3AlignedAllocDefault; - b3s_alignedFreeFunc = freeFunc ? freeFunc : b3AlignedFreeDefault; + b3s_alignedAllocFunc = allocFunc ? allocFunc : b3AlignedAllocDefault; + b3s_alignedFreeFunc = freeFunc ? freeFunc : b3AlignedFreeDefault; } void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc) { - b3s_allocFunc = allocFunc ? allocFunc : b3AllocDefault; - b3s_freeFunc = freeFunc ? freeFunc : b3FreeDefault; + b3s_allocFunc = allocFunc ? allocFunc : b3AllocDefault; + b3s_freeFunc = freeFunc ? freeFunc : b3FreeDefault; } #ifdef B3_DEBUG_MEMORY_ALLOCATIONS //this generic allocator provides the total allocated number of bytes #include <stdio.h> -void* b3AlignedAllocInternal (size_t size, int alignment,int line,char* filename) +void *b3AlignedAllocInternal(size_t size, int alignment, int line, char *filename) { - void *ret; - char *real; - - b3g_totalBytesAlignedAllocs += size; - b3g_numAlignedAllocs++; - - - real = (char *)b3s_allocFunc(size + 2*sizeof(void *) + (alignment-1)); - if (real) { - ret = (void*) b3AlignPointer(real + 2*sizeof(void *), alignment); - *((void **)(ret)-1) = (void *)(real); - *((int*)(ret)-2) = size; - - } else { - ret = (void *)(real);//?? - } + void *ret; + char *real; +#ifdef B3_ALLOCATOR_STATISTICS + b3g_totalBytesAlignedAllocs += size; + b3g_numAlignedAllocs++; +#endif + real = (char *)b3s_allocFunc(size + 2 * sizeof(void *) + (alignment - 1)); + if (real) + { + ret = (void *)b3AlignPointer(real + 2 * sizeof(void *), alignment); + *((void **)(ret)-1) = (void *)(real); + *((int *)(ret)-2) = size; + } + else + { + ret = (void *)(real); //?? + } - b3Printf("allocation#%d at address %x, from %s,line %d, size %d\n",b3g_numAlignedAllocs,real, filename,line,size); + b3Printf("allocation#%d at address %x, from %s,line %d, size %d\n", b3g_numAlignedAllocs, real, filename, line, size); - int* ptr = (int*)ret; - *ptr = 12; - return (ret); + int *ptr = (int *)ret; + *ptr = 12; + return (ret); } -void b3AlignedFreeInternal (void* ptr,int line,char* filename) +void b3AlignedFreeInternal(void *ptr, int line, char *filename) { + void *real; +#ifdef B3_ALLOCATOR_STATISTICS + b3g_numAlignedFree++; +#endif + if (ptr) + { + real = *((void **)(ptr)-1); + int size = *((int *)(ptr)-2); +#ifdef B3_ALLOCATOR_STATISTICS + b3g_totalBytesAlignedAllocs -= size; +#endif + b3Printf("free #%d at address %x, from %s,line %d, size %d\n", b3g_numAlignedFree, real, filename, line, size); - void* real; - b3g_numAlignedFree++; - - if (ptr) { - real = *((void **)(ptr)-1); - int size = *((int*)(ptr)-2); - b3g_totalBytesAlignedAllocs -= size; - - b3Printf("free #%d at address %x, from %s,line %d, size %d\n",b3g_numAlignedFree,real, filename,line,size); - - b3s_freeFunc(real); - } else - { - b3Printf("NULL ptr\n"); - } + b3s_freeFunc(real); + } + else + { + b3Printf("NULL ptr\n"); + } } -#else //B3_DEBUG_MEMORY_ALLOCATIONS +#else //B3_DEBUG_MEMORY_ALLOCATIONS -void* b3AlignedAllocInternal (size_t size, int alignment) +void *b3AlignedAllocInternal(size_t size, int alignment) { +#ifdef B3_ALLOCATOR_STATISTICS b3g_numAlignedAllocs++; - void* ptr; +#endif + void *ptr; ptr = b3s_alignedAllocFunc(size, alignment); -// b3Printf("b3AlignedAllocInternal %d, %x\n",size,ptr); + // b3Printf("b3AlignedAllocInternal %d, %x\n",size,ptr); return ptr; } -void b3AlignedFreeInternal (void* ptr) +void b3AlignedFreeInternal(void *ptr) { if (!ptr) { return; } - +#ifdef B3_ALLOCATOR_STATISTICS b3g_numAlignedFree++; -// b3Printf("b3AlignedFreeInternal %x\n",ptr); +#endif + // b3Printf("b3AlignedFreeInternal %x\n",ptr); b3s_alignedFreeFunc(ptr); } -#endif //B3_DEBUG_MEMORY_ALLOCATIONS - +#endif //B3_DEBUG_MEMORY_ALLOCATIONS diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h index be418bd55f..bcff9f128e 100644 --- a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h +++ b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h @@ -24,84 +24,87 @@ subject to the following restrictions: //#define B3_DEBUG_MEMORY_ALLOCATIONS 1 #ifdef B3_DEBUG_MEMORY_ALLOCATIONS -#define b3AlignedAlloc(a,b) \ - b3AlignedAllocInternal(a,b,__LINE__,__FILE__) +#define b3AlignedAlloc(a, b) \ + b3AlignedAllocInternal(a, b, __LINE__, __FILE__) #define b3AlignedFree(ptr) \ - b3AlignedFreeInternal(ptr,__LINE__,__FILE__) + b3AlignedFreeInternal(ptr, __LINE__, __FILE__) -void* b3AlignedAllocInternal (size_t size, int alignment,int line,char* filename); +void* b3AlignedAllocInternal(size_t size, int alignment, int line, char* filename); -void b3AlignedFreeInternal (void* ptr,int line,char* filename); +void b3AlignedFreeInternal(void* ptr, int line, char* filename); #else - void* b3AlignedAllocInternal (size_t size, int alignment); - void b3AlignedFreeInternal (void* ptr); +void* b3AlignedAllocInternal(size_t size, int alignment); +void b3AlignedFreeInternal(void* ptr); - #define b3AlignedAlloc(size,alignment) b3AlignedAllocInternal(size,alignment) - #define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr) +#define b3AlignedAlloc(size, alignment) b3AlignedAllocInternal(size, alignment) +#define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr) #endif -typedef int btSizeType; +typedef int btSizeType; -typedef void *(b3AlignedAllocFunc)(size_t size, int alignment); -typedef void (b3AlignedFreeFunc)(void *memblock); -typedef void *(b3AllocFunc)(size_t size); -typedef void (b3FreeFunc)(void *memblock); +typedef void*(b3AlignedAllocFunc)(size_t size, int alignment); +typedef void(b3AlignedFreeFunc)(void* memblock); +typedef void*(b3AllocFunc)(size_t size); +typedef void(b3FreeFunc)(void* memblock); ///The developer can let all Bullet memory allocations go through a custom memory allocator, using b3AlignedAllocSetCustom -void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc); +void b3AlignedAllocSetCustom(b3AllocFunc* allocFunc, b3FreeFunc* freeFunc); ///If the developer has already an custom aligned allocator, then b3AlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it. -void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc); - +void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc* allocFunc, b3AlignedFreeFunc* freeFunc); ///The b3AlignedAllocator is a portable class for aligned memory allocations. ///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using b3AlignedAllocSetCustom and b3AlignedAllocSetCustomAligned. -template < typename T , unsigned Alignment > -class b3AlignedAllocator { - - typedef b3AlignedAllocator< T , Alignment > self_type; - -public: +template <typename T, unsigned Alignment> +class b3AlignedAllocator +{ + typedef b3AlignedAllocator<T, Alignment> self_type; +public: //just going down a list: b3AlignedAllocator() {} /* b3AlignedAllocator( const self_type & ) {} */ - template < typename Other > - b3AlignedAllocator( const b3AlignedAllocator< Other , Alignment > & ) {} + template <typename Other> + b3AlignedAllocator(const b3AlignedAllocator<Other, Alignment>&) + { + } - typedef const T* const_pointer; - typedef const T& const_reference; - typedef T* pointer; - typedef T& reference; - typedef T value_type; + typedef const T* const_pointer; + typedef const T& const_reference; + typedef T* pointer; + typedef T& reference; + typedef T value_type; - pointer address ( reference ref ) const { return &ref; } - const_pointer address ( const_reference ref ) const { return &ref; } - pointer allocate ( btSizeType n , const_pointer * hint = 0 ) { + pointer address(reference ref) const { return &ref; } + const_pointer address(const_reference ref) const { return &ref; } + pointer allocate(btSizeType n, const_pointer* hint = 0) + { (void)hint; - return reinterpret_cast< pointer >(b3AlignedAlloc( sizeof(value_type) * n , Alignment )); + return reinterpret_cast<pointer>(b3AlignedAlloc(sizeof(value_type) * n, Alignment)); } - void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); } - void deallocate( pointer ptr ) { - b3AlignedFree( reinterpret_cast< void * >( ptr ) ); + void construct(pointer ptr, const value_type& value) { new (ptr) value_type(value); } + void deallocate(pointer ptr) + { + b3AlignedFree(reinterpret_cast<void*>(ptr)); } - void destroy ( pointer ptr ) { ptr->~value_type(); } - + void destroy(pointer ptr) { ptr->~value_type(); } - template < typename O > struct rebind { - typedef b3AlignedAllocator< O , Alignment > other; + template <typename O> + struct rebind + { + typedef b3AlignedAllocator<O, Alignment> other; }; - template < typename O > - self_type & operator=( const b3AlignedAllocator< O , Alignment > & ) { return *this; } + template <typename O> + self_type& operator=(const b3AlignedAllocator<O, Alignment>&) + { + return *this; + } - friend bool operator==( const self_type & , const self_type & ) { return true; } + friend bool operator==(const self_type&, const self_type&) { return true; } }; - - -#endif //B3_ALIGNED_ALLOCATOR - +#endif //B3_ALIGNED_ALLOCATOR diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h b/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h index 947362d08e..249e381bf1 100644 --- a/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h +++ b/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h @@ -13,11 +13,10 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_OBJECT_ARRAY__ #define B3_OBJECT_ARRAY__ -#include "b3Scalar.h" // has definitions like B3_FORCE_INLINE +#include "b3Scalar.h" // has definitions like B3_FORCE_INLINE #include "b3AlignedAllocator.h" ///If the platform doesn't support placement new, you can disable B3_USE_PLACEMENT_NEW @@ -28,402 +27,386 @@ subject to the following restrictions: #define B3_USE_PLACEMENT_NEW 1 //#define B3_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise... -#define B3_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful +#define B3_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful #ifdef B3_USE_MEMCPY #include <memory.h> #include <string.h> -#endif //B3_USE_MEMCPY +#endif //B3_USE_MEMCPY #ifdef B3_USE_PLACEMENT_NEW -#include <new> //for placement new -#endif //B3_USE_PLACEMENT_NEW - +#include <new> //for placement new +#endif //B3_USE_PLACEMENT_NEW ///The b3AlignedObjectArray template class uses a subset of the stl::vector interface for its methods ///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data -template <typename T> -//template <class T> +template <typename T> +//template <class T> class b3AlignedObjectArray { - b3AlignedAllocator<T , 16> m_allocator; + b3AlignedAllocator<T, 16> m_allocator; - int m_size; - int m_capacity; - T* m_data; + int m_size; + int m_capacity; + T* m_data; //PCK: added this line - bool m_ownsMemory; + bool m_ownsMemory; #ifdef B3_ALLOW_ARRAY_COPY_OPERATOR public: - B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other) + B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other) { copyFromArray(other); return *this; } -#else//B3_ALLOW_ARRAY_COPY_OPERATOR +#else //B3_ALLOW_ARRAY_COPY_OPERATOR private: - B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other); -#endif//B3_ALLOW_ARRAY_COPY_OPERATOR + B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other); +#endif //B3_ALLOW_ARRAY_COPY_OPERATOR protected: - B3_FORCE_INLINE int allocSize(int size) - { - return (size ? size*2 : 1); - } - B3_FORCE_INLINE void copy(int start,int end, T* dest) const - { - int i; - for (i=start;i<end;++i) + B3_FORCE_INLINE int allocSize(int size) + { + return (size ? size * 2 : 1); + } + B3_FORCE_INLINE void copy(int start, int end, T* dest) const + { + int i; + for (i = start; i < end; ++i) #ifdef B3_USE_PLACEMENT_NEW - new (&dest[i]) T(m_data[i]); + new (&dest[i]) T(m_data[i]); #else - dest[i] = m_data[i]; -#endif //B3_USE_PLACEMENT_NEW - } + dest[i] = m_data[i]; +#endif //B3_USE_PLACEMENT_NEW + } - B3_FORCE_INLINE void init() + B3_FORCE_INLINE void init() + { + //PCK: added this line + m_ownsMemory = true; + m_data = 0; + m_size = 0; + m_capacity = 0; + } + B3_FORCE_INLINE void destroy(int first, int last) + { + int i; + for (i = first; i < last; i++) { - //PCK: added this line - m_ownsMemory = true; - m_data = 0; - m_size = 0; - m_capacity = 0; + m_data[i].~T(); } - B3_FORCE_INLINE void destroy(int first,int last) + } + + B3_FORCE_INLINE void* allocate(int size) + { + if (size) + return m_allocator.allocate(size); + return 0; + } + + B3_FORCE_INLINE void deallocate() + { + if (m_data) { - int i; - for (i=first; i<last;i++) + //PCK: enclosed the deallocation in this block + if (m_ownsMemory) { - m_data[i].~T(); + m_allocator.deallocate(m_data); } + m_data = 0; } + } - B3_FORCE_INLINE void* allocate(int size) - { - if (size) - return m_allocator.allocate(size); - return 0; - } +public: + b3AlignedObjectArray() + { + init(); + } - B3_FORCE_INLINE void deallocate() - { - if(m_data) { - //PCK: enclosed the deallocation in this block - if (m_ownsMemory) - { - m_allocator.deallocate(m_data); - } - m_data = 0; - } - } + ~b3AlignedObjectArray() + { + clear(); + } - + ///Generally it is best to avoid using the copy constructor of an b3AlignedObjectArray, and use a (const) reference to the array instead. + b3AlignedObjectArray(const b3AlignedObjectArray& otherArray) + { + init(); + int otherSize = otherArray.size(); + resize(otherSize); + otherArray.copy(0, otherSize, m_data); + } - public: - - b3AlignedObjectArray() - { - init(); - } + /// return the number of elements in the array + B3_FORCE_INLINE int size() const + { + return m_size; + } - ~b3AlignedObjectArray() - { - clear(); - } + B3_FORCE_INLINE const T& at(int n) const + { + b3Assert(n >= 0); + b3Assert(n < size()); + return m_data[n]; + } - ///Generally it is best to avoid using the copy constructor of an b3AlignedObjectArray, and use a (const) reference to the array instead. - b3AlignedObjectArray(const b3AlignedObjectArray& otherArray) - { - init(); + B3_FORCE_INLINE T& at(int n) + { + b3Assert(n >= 0); + b3Assert(n < size()); + return m_data[n]; + } - int otherSize = otherArray.size(); - resize (otherSize); - otherArray.copy(0, otherSize, m_data); - } + B3_FORCE_INLINE const T& operator[](int n) const + { + b3Assert(n >= 0); + b3Assert(n < size()); + return m_data[n]; + } - - - /// return the number of elements in the array - B3_FORCE_INLINE int size() const - { - return m_size; - } - - B3_FORCE_INLINE const T& at(int n) const - { - b3Assert(n>=0); - b3Assert(n<size()); - return m_data[n]; - } + B3_FORCE_INLINE T& operator[](int n) + { + b3Assert(n >= 0); + b3Assert(n < size()); + return m_data[n]; + } - B3_FORCE_INLINE T& at(int n) - { - b3Assert(n>=0); - b3Assert(n<size()); - return m_data[n]; - } + ///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations. + B3_FORCE_INLINE void clear() + { + destroy(0, size()); - B3_FORCE_INLINE const T& operator[](int n) const - { - b3Assert(n>=0); - b3Assert(n<size()); - return m_data[n]; - } + deallocate(); - B3_FORCE_INLINE T& operator[](int n) - { - b3Assert(n>=0); - b3Assert(n<size()); - return m_data[n]; - } - + init(); + } - ///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations. - B3_FORCE_INLINE void clear() + B3_FORCE_INLINE void pop_back() + { + b3Assert(m_size > 0); + m_size--; + m_data[m_size].~T(); + } + + ///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument. + ///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations. + B3_FORCE_INLINE void resizeNoInitialize(int newsize) + { + int curSize = size(); + + if (newsize < curSize) { - destroy(0,size()); - - deallocate(); - - init(); } - - B3_FORCE_INLINE void pop_back() + else { - b3Assert(m_size>0); - m_size--; - m_data[m_size].~T(); + if (newsize > size()) + { + reserve(newsize); + } + //leave this uninitialized } + m_size = newsize; + } + B3_FORCE_INLINE void resize(int newsize, const T& fillData = T()) + { + int curSize = size(); - ///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument. - ///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations. - B3_FORCE_INLINE void resizeNoInitialize(int newsize) + if (newsize < curSize) { - int curSize = size(); - - if (newsize < curSize) - { - } else + for (int i = newsize; i < curSize; i++) { - if (newsize > size()) - { - reserve(newsize); - } - //leave this uninitialized + m_data[i].~T(); } - m_size = newsize; } - - B3_FORCE_INLINE void resize(int newsize, const T& fillData=T()) + else { - int curSize = size(); - - if (newsize < curSize) - { - for(int i = newsize; i < curSize; i++) - { - m_data[i].~T(); - } - } else + if (newsize > size()) { - if (newsize > size()) - { - reserve(newsize); - } -#ifdef B3_USE_PLACEMENT_NEW - for (int i=curSize;i<newsize;i++) - { - new ( &m_data[i]) T(fillData); - } -#endif //B3_USE_PLACEMENT_NEW - + reserve(newsize); } - - m_size = newsize; - } - B3_FORCE_INLINE T& expandNonInitializing( ) - { - int sz = size(); - if( sz == capacity() ) +#ifdef B3_USE_PLACEMENT_NEW + for (int i = curSize; i < newsize; i++) { - reserve( allocSize(size()) ); + new (&m_data[i]) T(fillData); } - m_size++; +#endif //B3_USE_PLACEMENT_NEW + } - return m_data[sz]; + m_size = newsize; + } + B3_FORCE_INLINE T& expandNonInitializing() + { + int sz = size(); + if (sz == capacity()) + { + reserve(allocSize(size())); } + m_size++; + return m_data[sz]; + } - B3_FORCE_INLINE T& expand( const T& fillValue=T()) - { - int sz = size(); - if( sz == capacity() ) - { - reserve( allocSize(size()) ); - } - m_size++; + B3_FORCE_INLINE T& expand(const T& fillValue = T()) + { + int sz = size(); + if (sz == capacity()) + { + reserve(allocSize(size())); + } + m_size++; #ifdef B3_USE_PLACEMENT_NEW - new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory) + new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory) #endif - return m_data[sz]; - } + return m_data[sz]; + } + B3_FORCE_INLINE void push_back(const T& _Val) + { + int sz = size(); + if (sz == capacity()) + { + reserve(allocSize(size())); + } - B3_FORCE_INLINE void push_back(const T& _Val) - { - int sz = size(); - if( sz == capacity() ) - { - reserve( allocSize(size()) ); - } - #ifdef B3_USE_PLACEMENT_NEW - new ( &m_data[m_size] ) T(_Val); + new (&m_data[m_size]) T(_Val); #else - m_data[size()] = _Val; -#endif //B3_USE_PLACEMENT_NEW + m_data[size()] = _Val; +#endif //B3_USE_PLACEMENT_NEW - m_size++; - } + m_size++; + } - - /// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve() - B3_FORCE_INLINE int capacity() const - { - return m_capacity; - } - - B3_FORCE_INLINE void reserve(int _Count) - { // determine new minimum length of allocated storage - if (capacity() < _Count) - { // not enough room, reallocate - T* s = (T*)allocate(_Count); - b3Assert(s); - if (s==0) - { - b3Error("b3AlignedObjectArray reserve out-of-memory\n"); - _Count=0; - m_size=0; - } - copy(0, size(), s); - - destroy(0,size()); - - deallocate(); - - //PCK: added this line - m_ownsMemory = true; - - m_data = s; - - m_capacity = _Count; + /// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve() + B3_FORCE_INLINE int capacity() const + { + return m_capacity; + } + B3_FORCE_INLINE void reserve(int _Count) + { // determine new minimum length of allocated storage + if (capacity() < _Count) + { // not enough room, reallocate + T* s = (T*)allocate(_Count); + b3Assert(s); + if (s == 0) + { + b3Error("b3AlignedObjectArray reserve out-of-memory\n"); + _Count = 0; + m_size = 0; } - } + copy(0, size(), s); + destroy(0, size()); - class less - { - public: + deallocate(); + + //PCK: added this line + m_ownsMemory = true; + + m_data = s; - bool operator() ( const T& a, const T& b ) - { - return ( a < b ); - } - }; - + m_capacity = _Count; + } + } - template <typename L> - void quickSortInternal(const L& CompareFunc,int lo, int hi) + class less + { + public: + bool operator()(const T& a, const T& b) { - // lo is the lower index, hi is the upper index - // of the region of array a that is to be sorted - int i=lo, j=hi; - T x=m_data[(lo+hi)/2]; - - // partition - do - { - while (CompareFunc(m_data[i],x)) - i++; - while (CompareFunc(x,m_data[j])) - j--; - if (i<=j) - { - swap(i,j); - i++; j--; - } - } while (i<=j); - - // recursion - if (lo<j) - quickSortInternal( CompareFunc, lo, j); - if (i<hi) - quickSortInternal( CompareFunc, i, hi); + return (a < b); } + }; + template <typename L> + void quickSortInternal(const L& CompareFunc, int lo, int hi) + { + // lo is the lower index, hi is the upper index + // of the region of array a that is to be sorted + int i = lo, j = hi; + T x = m_data[(lo + hi) / 2]; - template <typename L> - void quickSort(const L& CompareFunc) + // partition + do { - //don't sort 0 or 1 elements - if (size()>1) + while (CompareFunc(m_data[i], x)) + i++; + while (CompareFunc(x, m_data[j])) + j--; + if (i <= j) { - quickSortInternal(CompareFunc,0,size()-1); + swap(i, j); + i++; + j--; } + } while (i <= j); + + // recursion + if (lo < j) + quickSortInternal(CompareFunc, lo, j); + if (i < hi) + quickSortInternal(CompareFunc, i, hi); + } + + template <typename L> + void quickSort(const L& CompareFunc) + { + //don't sort 0 or 1 elements + if (size() > 1) + { + quickSortInternal(CompareFunc, 0, size() - 1); } + } + ///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ + template <typename L> + void downHeap(T* pArr, int k, int n, const L& CompareFunc) + { + /* PRE: a[k+1..N] is a heap */ + /* POST: a[k..N] is a heap */ - ///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ - template <typename L> - void downHeap(T *pArr, int k, int n, const L& CompareFunc) + T temp = pArr[k - 1]; + /* k has child(s) */ + while (k <= n / 2) { - /* PRE: a[k+1..N] is a heap */ - /* POST: a[k..N] is a heap */ - - T temp = pArr[k - 1]; - /* k has child(s) */ - while (k <= n/2) + int child = 2 * k; + + if ((child < n) && CompareFunc(pArr[child - 1], pArr[child])) + { + child++; + } + /* pick larger child */ + if (CompareFunc(temp, pArr[child - 1])) { - int child = 2*k; - - if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child])) - { - child++; - } - /* pick larger child */ - if (CompareFunc(temp , pArr[child - 1])) - { - /* move child up */ - pArr[k - 1] = pArr[child - 1]; - k = child; - } - else - { - break; - } + /* move child up */ + pArr[k - 1] = pArr[child - 1]; + k = child; } - pArr[k - 1] = temp; - } /*downHeap*/ + else + { + break; + } + } + pArr[k - 1] = temp; + } /*downHeap*/ - void swap(int index0,int index1) - { + void swap(int index0, int index1) + { #ifdef B3_USE_MEMCPY - char temp[sizeof(T)]; - memcpy(temp,&m_data[index0],sizeof(T)); - memcpy(&m_data[index0],&m_data[index1],sizeof(T)); - memcpy(&m_data[index1],temp,sizeof(T)); + char temp[sizeof(T)]; + memcpy(temp, &m_data[index0], sizeof(T)); + memcpy(&m_data[index0], &m_data[index1], sizeof(T)); + memcpy(&m_data[index1], temp, sizeof(T)); #else - T temp = m_data[index0]; - m_data[index0] = m_data[index1]; - m_data[index1] = temp; -#endif //B3_USE_PLACEMENT_NEW - - } + T temp = m_data[index0]; + m_data[index0] = m_data[index1]; + m_data[index1] = temp; +#endif //B3_USE_PLACEMENT_NEW + } template <typename L> void heapSort(const L& CompareFunc) @@ -431,49 +414,48 @@ protected: /* sort a[0..N-1], N.B. 0 to N-1 */ int k; int n = m_size; - for (k = n/2; k > 0; k--) + for (k = n / 2; k > 0; k--) { downHeap(m_data, k, n, CompareFunc); } /* a[1..N] is now a heap */ - while ( n>=1 ) + while (n >= 1) { - swap(0,n-1); /* largest of a[0..n-1] */ - + swap(0, n - 1); /* largest of a[0..n-1] */ n = n - 1; /* restore a[1..i-1] heap */ downHeap(m_data, 1, n, CompareFunc); - } + } } ///non-recursive binary search, assumes sorted array - int findBinarySearch(const T& key) const + int findBinarySearch(const T& key) const { int first = 0; - int last = size()-1; + int last = size() - 1; //assume sorted array - while (first <= last) { + while (first <= last) + { int mid = (first + last) / 2; // compute mid point. - if (key > m_data[mid]) + if (key > m_data[mid]) first = mid + 1; // repeat search in top half. - else if (key < m_data[mid]) - last = mid - 1; // repeat search in bottom half. + else if (key < m_data[mid]) + last = mid - 1; // repeat search in bottom half. else - return mid; // found it. return position ///// + return mid; // found it. return position ///// } - return size(); // failed to find key + return size(); // failed to find key } - - int findLinearSearch(const T& key) const + int findLinearSearch(const T& key) const { - int index=size(); + int index = size(); int i; - for (i=0;i<size();i++) + for (i = 0; i < size(); i++) { if (m_data[i] == key) { @@ -483,36 +465,35 @@ protected: } return index; } - - int findLinearSearch2(const T& key) const - { - int index=-1; - int i; - - for (i=0;i<size();i++) - { - if (m_data[i] == key) - { - index = i; - break; - } - } - return index; - } - - void remove(const T& key) + + int findLinearSearch2(const T& key) const { + int index = -1; + int i; + for (i = 0; i < size(); i++) + { + if (m_data[i] == key) + { + index = i; + break; + } + } + return index; + } + + void remove(const T& key) + { int findIndex = findLinearSearch(key); - if (findIndex<size()) + if (findIndex < size()) { - swap( findIndex,size()-1); + swap(findIndex, size() - 1); pop_back(); } } //PCK: whole function - void initializeFromBuffer(void *buffer, int size, int capacity) + void initializeFromBuffer(void* buffer, int size, int capacity) { clear(); m_ownsMemory = false; @@ -524,10 +505,18 @@ protected: void copyFromArray(const b3AlignedObjectArray& otherArray) { int otherSize = otherArray.size(); - resize (otherSize); + resize(otherSize); otherArray.copy(0, otherSize, m_data); } + void removeAtIndex(int index) + { + if (index < size()) + { + swap(index, size() - 1); + pop_back(); + } + } }; -#endif //B3_OBJECT_ARRAY__ +#endif //B3_OBJECT_ARRAY__ diff --git a/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h b/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h index 38df8e2600..5fe4f25f8d 100644 --- a/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h +++ b/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h @@ -12,51 +12,54 @@ class b3CommandLineArgs { protected: - std::map<std::string, std::string> pairs; public: - // Constructor b3CommandLineArgs(int argc, char **argv) { - addArgs(argc,argv); + addArgs(argc, argv); } - void addArgs(int argc, char**argv) + void addArgs(int argc, char **argv) { - for (int i = 1; i < argc; i++) - { - std::string arg = argv[i]; + for (int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + + if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) + { + continue; + } - if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) { - continue; - } + std::string::size_type pos; + std::string key, val; + if ((pos = arg.find('=')) == std::string::npos) + { + key = std::string(arg, 2, arg.length() - 2); + val = ""; + } + else + { + key = std::string(arg, 2, pos - 2); + val = std::string(arg, pos + 1, arg.length() - 1); + } - std::string::size_type pos; - std::string key, val; - if ((pos = arg.find( '=')) == std::string::npos) { - key = std::string(arg, 2, arg.length() - 2); - val = ""; - } else { - key = std::string(arg, 2, pos - 2); - val = std::string(arg, pos + 1, arg.length() - 1); - } - //only add new keys, don't replace existing - if(pairs.find(key) == pairs.end()) + if (pairs.find(key) == pairs.end()) { - pairs[key] = val; + pairs[key] = val; } - } + } } - bool CheckCmdLineFlag(const char* arg_name) + bool CheckCmdLineFlag(const char *arg_name) { std::map<std::string, std::string>::iterator itr; - if ((itr = pairs.find(arg_name)) != pairs.end()) { + if ((itr = pairs.find(arg_name)) != pairs.end()) + { return true; - } + } return false; } @@ -73,29 +76,31 @@ template <typename T> inline bool b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) { std::map<std::string, std::string>::iterator itr; - if ((itr = pairs.find(arg_name)) != pairs.end()) { + if ((itr = pairs.find(arg_name)) != pairs.end()) + { std::istringstream strstream(itr->second); strstream >> val; return true; - } + } return false; } template <> -inline bool b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val) +inline bool b3CommandLineArgs::GetCmdLineArgument<char *>(const char *arg_name, char *&val) { std::map<std::string, std::string>::iterator itr; - if ((itr = pairs.find(arg_name)) != pairs.end()) { - + if ((itr = pairs.find(arg_name)) != pairs.end()) + { std::string s = itr->second; - val = (char*) malloc(sizeof(char) * (s.length() + 1)); + val = (char *)malloc(sizeof(char) * (s.length() + 1)); std::strcpy(val, s.c_str()); return true; - } else { - val = NULL; + } + else + { + val = NULL; } return false; } - -#endif //COMMAND_LINE_ARGS_H +#endif //COMMAND_LINE_ARGS_H diff --git a/thirdparty/bullet/Bullet3Common/b3FileUtils.h b/thirdparty/bullet/Bullet3Common/b3FileUtils.h index 1a331029ea..9ded17eaaf 100644 --- a/thirdparty/bullet/Bullet3Common/b3FileUtils.h +++ b/thirdparty/bullet/Bullet3Common/b3FileUtils.h @@ -3,7 +3,7 @@ #include <stdio.h> #include "b3Scalar.h" -#include <stddef.h>//ptrdiff_h +#include <stddef.h> //ptrdiff_h #include <string.h> struct b3FileUtils @@ -17,42 +17,42 @@ struct b3FileUtils static bool findFile(const char* orgFileName, char* relativeFileName, int maxRelativeFileNameMaxLen) { - FILE* f=0; - f = fopen(orgFileName,"rb"); - if (f) - { + FILE* f = 0; + f = fopen(orgFileName, "rb"); + if (f) + { //printf("original file found: [%s]\n", orgFileName); - sprintf(relativeFileName,"%s", orgFileName); + sprintf(relativeFileName, "%s", orgFileName); fclose(f); return true; } - //printf("Trying various directories, relative to current working directory\n"); - const char* prefix[]={"./","./data/","../data/","../../data/","../../../data/","../../../../data/"}; - int numPrefixes = sizeof(prefix)/sizeof(const char*); - - f=0; - bool fileFound = false; + //printf("Trying various directories, relative to current working directory\n"); + const char* prefix[] = {"./", "./data/", "../data/", "../../data/", "../../../data/", "../../../../data/"}; + int numPrefixes = sizeof(prefix) / sizeof(const char*); - for (int i=0;!f && i<numPrefixes;i++) - { -#ifdef _WIN32 - sprintf_s(relativeFileName,maxRelativeFileNameMaxLen,"%s%s",prefix[i],orgFileName); + f = 0; + bool fileFound = false; + + for (int i = 0; !f && i < numPrefixes; i++) + { +#ifdef _MSC_VER + sprintf_s(relativeFileName, maxRelativeFileNameMaxLen, "%s%s", prefix[i], orgFileName); #else - sprintf(relativeFileName,"%s%s",prefix[i],orgFileName); + sprintf(relativeFileName, "%s%s", prefix[i], orgFileName); #endif - f = fopen(relativeFileName,"rb"); - if (f) - { - fileFound = true; - break; - } - } + f = fopen(relativeFileName, "rb"); if (f) { - fclose(f); + fileFound = true; + break; } - + } + if (f) + { + fclose(f); + } + return fileFound; } @@ -60,8 +60,8 @@ struct b3FileUtils { size_t const patlen = strlen(pattern); size_t patcnt = 0; - const char * oriptr; - const char * patloc; + const char* oriptr; + const char* patloc; // find how many times the pattern occurs in the original string for (oriptr = name; (patloc = strstr(oriptr, pattern)); oriptr = patloc + patlen) { @@ -70,29 +70,27 @@ struct b3FileUtils return oriptr; } - - static int extractPath(const char* fileName, char* path, int maxPathLength) { const char* stripped = strip2(fileName, "/"); stripped = strip2(stripped, "\\"); - ptrdiff_t len = stripped-fileName; - b3Assert((len+1)<maxPathLength); + ptrdiff_t len = stripped - fileName; + b3Assert((len + 1) < maxPathLength); - if (len && ((len+1)<maxPathLength)) + if (len && ((len + 1) < maxPathLength)) { - - for (int i=0;i<len;i++) + for (int i = 0; i < len; i++) { path[i] = fileName[i]; } - path[len]=0; - } else + path[len] = 0; + } + else { len = 0; - b3Assert(maxPathLength>0); - if (maxPathLength>0) + b3Assert(maxPathLength > 0); + if (maxPathLength > 0) { path[len] = 0; } @@ -102,23 +100,21 @@ struct b3FileUtils static char toLowerChar(const char t) { - if (t>=(char)'A' && t<=(char)'Z') + if (t >= (char)'A' && t <= (char)'Z') return t + ((char)'a' - (char)'A'); else return t; } - static void toLower(char* str) { - int len=strlen(str); - for (int i=0;i<len;i++) + int len = strlen(str); + for (int i = 0; i < len; i++) { str[i] = toLowerChar(str[i]); } } - /*static const char* strip2(const char* name, const char* pattern) { size_t const patlen = strlen(pattern); @@ -133,6 +129,5 @@ struct b3FileUtils return oriptr; } */ - }; -#endif //B3_FILE_UTILS_H +#endif //B3_FILE_UTILS_H diff --git a/thirdparty/bullet/Bullet3Common/b3HashMap.h b/thirdparty/bullet/Bullet3Common/b3HashMap.h index 24a59d9baa..3009e2cf2f 100644 --- a/thirdparty/bullet/Bullet3Common/b3HashMap.h +++ b/thirdparty/bullet/Bullet3Common/b3HashMap.h @@ -13,86 +13,80 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_HASH_MAP_H #define B3_HASH_MAP_H #include "b3AlignedObjectArray.h" - #include <string> ///very basic hashable string implementation, compatible with b3HashMap struct b3HashString { std::string m_string; - unsigned int m_hash; + unsigned int m_hash; - B3_FORCE_INLINE unsigned int getHash()const + B3_FORCE_INLINE unsigned int getHash() const { return m_hash; } - b3HashString(const char* name) - :m_string(name) + : m_string(name) { - /* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */ - static const unsigned int InitialFNV = 2166136261u; + static const unsigned int InitialFNV = 2166136261u; static const unsigned int FNVMultiple = 16777619u; /* Fowler / Noll / Vo (FNV) Hash */ unsigned int hash = InitialFNV; int len = m_string.length(); - for(int i = 0; i<len; i++) + for (int i = 0; i < len; i++) { - hash = hash ^ (m_string[i]); /* xor the low 8 bits */ - hash = hash * FNVMultiple; /* multiply by the magic number */ + hash = hash ^ (m_string[i]); /* xor the low 8 bits */ + hash = hash * FNVMultiple; /* multiply by the magic number */ } m_hash = hash; } - int portableStringCompare(const char* src, const char* dst) const + int portableStringCompare(const char* src, const char* dst) const { - int ret = 0 ; + int ret = 0; - while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst) - ++src, ++dst; + while (!(ret = *(unsigned char*)src - *(unsigned char*)dst) && *dst) + ++src, ++dst; - if ( ret < 0 ) - ret = -1 ; - else if ( ret > 0 ) - ret = 1 ; + if (ret < 0) + ret = -1; + else if (ret > 0) + ret = 1; - return( ret ); + return (ret); } bool equals(const b3HashString& other) const { return (m_string == other.m_string); } - }; - -const int B3_HASH_NULL=0xffffffff; - +const int B3_HASH_NULL = 0xffffffff; class b3HashInt { - int m_uid; + int m_uid; + public: - b3HashInt(int uid) :m_uid(uid) + b3HashInt(int uid) : m_uid(uid) { } - int getUid1() const + int getUid1() const { return m_uid; } - void setUid1(int uid) + void setUid1(int uid) { m_uid = uid; } @@ -102,34 +96,34 @@ public: return getUid1() == other.getUid1(); } //to our success - B3_FORCE_INLINE unsigned int getHash()const + B3_FORCE_INLINE unsigned int getHash() const { int key = m_uid; // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); return key; } }; - - class b3HashPtr { - - union - { - const void* m_pointer; - int m_hashValues[2]; + union { + const void* m_pointer; + int m_hashValues[2]; }; public: - b3HashPtr(const void* ptr) - :m_pointer(ptr) + : m_pointer(ptr) { } - const void* getPointer() const + const void* getPointer() const { return m_pointer; } @@ -140,65 +134,69 @@ public: } //to our success - B3_FORCE_INLINE unsigned int getHash()const + B3_FORCE_INLINE unsigned int getHash() const { - const bool VOID_IS_8 = ((sizeof(void*)==8)); - - int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0]; - + const bool VOID_IS_8 = ((sizeof(void*) == 8)); + + int key = VOID_IS_8 ? m_hashValues[0] + m_hashValues[1] : m_hashValues[0]; + // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); return key; } - - }; - template <class Value> class b3HashKeyPtr { - int m_uid; + int m_uid; + public: + b3HashKeyPtr(int uid) : m_uid(uid) + { + } - b3HashKeyPtr(int uid) :m_uid(uid) - { - } - - int getUid1() const - { - return m_uid; - } - - bool equals(const b3HashKeyPtr<Value>& other) const - { - return getUid1() == other.getUid1(); - } - - //to our success - B3_FORCE_INLINE unsigned int getHash()const - { - int key = m_uid; - // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); - return key; - } - - -}; + int getUid1() const + { + return m_uid; + } + bool equals(const b3HashKeyPtr<Value>& other) const + { + return getUid1() == other.getUid1(); + } + + //to our success + B3_FORCE_INLINE unsigned int getHash() const + { + int key = m_uid; + // Thomas Wang's hash + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; + } +}; template <class Value> class b3HashKey { - int m_uid; -public: + int m_uid; - b3HashKey(int uid) :m_uid(uid) +public: + b3HashKey(int uid) : m_uid(uid) { } - int getUid1() const + int getUid1() const { return m_uid; } @@ -208,30 +206,33 @@ public: return getUid1() == other.getUid1(); } //to our success - B3_FORCE_INLINE unsigned int getHash()const + B3_FORCE_INLINE unsigned int getHash() const { int key = m_uid; // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); return key; } }; - ///The b3HashMap template class implements a generic and lightweight hashmap. ///A basic sample of how to use b3HashMap is located in Demos\BasicDemo\main.cpp template <class Key, class Value> class b3HashMap { - protected: - b3AlignedObjectArray<int> m_hashTable; - b3AlignedObjectArray<int> m_next; - - b3AlignedObjectArray<Value> m_valueArray; - b3AlignedObjectArray<Key> m_keyArray; + b3AlignedObjectArray<int> m_hashTable; + b3AlignedObjectArray<int> m_next; + + b3AlignedObjectArray<Value> m_valueArray; + b3AlignedObjectArray<Key> m_keyArray; - void growTables(const Key& /*key*/) + void growTables(const Key& /*key*/) { int newCapacity = m_valueArray.capacity(); @@ -245,7 +246,7 @@ protected: int i; - for (i= 0; i < newCapacity; ++i) + for (i = 0; i < newCapacity; ++i) { m_hashTable[i] = B3_HASH_NULL; } @@ -254,30 +255,28 @@ protected: m_next[i] = B3_HASH_NULL; } - for(i=0;i<curHashtableSize;i++) + for (i = 0; i < curHashtableSize; i++) { //const Value& value = m_valueArray[i]; //const Key& key = m_keyArray[i]; - int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1); // New hash value with new mask + int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity() - 1); // New hash value with new mask m_next[i] = m_hashTable[hashValue]; m_hashTable[hashValue] = i; } - - } } - public: - - void insert(const Key& key, const Value& value) { - int hash = key.getHash() & (m_valueArray.capacity()-1); +public: + void insert(const Key& key, const Value& value) + { + int hash = key.getHash() & (m_valueArray.capacity() - 1); //replace value if the key is already there int index = findIndex(key); if (index != B3_HASH_NULL) { - m_valueArray[index]=value; + m_valueArray[index] = value; return; } @@ -291,19 +290,19 @@ protected: { growTables(key); //hash with new capacity - hash = key.getHash() & (m_valueArray.capacity()-1); + hash = key.getHash() & (m_valueArray.capacity() - 1); } m_next[count] = m_hashTable[hash]; m_hashTable[hash] = count; } - void remove(const Key& key) { - - int hash = key.getHash() & (m_valueArray.capacity()-1); + void remove(const Key& key) + { + int hash = key.getHash() & (m_valueArray.capacity() - 1); int pairIndex = findIndex(key); - - if (pairIndex ==B3_HASH_NULL) + + if (pairIndex == B3_HASH_NULL) { return; } @@ -344,7 +343,7 @@ protected: } // Remove the last pair from the hash table. - int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1); + int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity() - 1); index = m_hashTable[lastHash]; b3Assert(index != B3_HASH_NULL); @@ -376,10 +375,8 @@ protected: m_valueArray.pop_back(); m_keyArray.pop_back(); - } - int size() const { return m_valueArray.size(); @@ -399,23 +396,24 @@ protected: return &m_valueArray[index]; } - Key getKeyAtIndex(int index) - { - b3Assert(index < m_keyArray.size()); - return m_keyArray[index]; - } - - const Key getKeyAtIndex(int index) const - { - b3Assert(index < m_keyArray.size()); - return m_keyArray[index]; - } + Key getKeyAtIndex(int index) + { + b3Assert(index < m_keyArray.size()); + return m_keyArray[index]; + } + + const Key getKeyAtIndex(int index) const + { + b3Assert(index < m_keyArray.size()); + return m_keyArray[index]; + } - Value* operator[](const Key& key) { + Value* operator[](const Key& key) + { return find(key); } - const Value* find(const Key& key) const + const Value* find(const Key& key) const { int index = findIndex(key); if (index == B3_HASH_NULL) @@ -425,7 +423,7 @@ protected: return &m_valueArray[index]; } - Value* find(const Key& key) + Value* find(const Key& key) { int index = findIndex(key); if (index == B3_HASH_NULL) @@ -435,10 +433,9 @@ protected: return &m_valueArray[index]; } - - int findIndex(const Key& key) const + int findIndex(const Key& key) const { - unsigned int hash = key.getHash() & (m_valueArray.capacity()-1); + unsigned int hash = key.getHash() & (m_valueArray.capacity() - 1); if (hash >= (unsigned int)m_hashTable.size()) { @@ -453,14 +450,13 @@ protected: return index; } - void clear() + void clear() { m_hashTable.clear(); m_next.clear(); m_valueArray.clear(); m_keyArray.clear(); } - }; -#endif //B3_HASH_MAP_H +#endif //B3_HASH_MAP_H diff --git a/thirdparty/bullet/Bullet3Common/b3Logging.cpp b/thirdparty/bullet/Bullet3Common/b3Logging.cpp index a8e9507155..9c9f7c09ea 100644 --- a/thirdparty/bullet/Bullet3Common/b3Logging.cpp +++ b/thirdparty/bullet/Bullet3Common/b3Logging.cpp @@ -20,17 +20,16 @@ subject to the following restrictions: #ifdef _WIN32 #include <windows.h> -#endif //_WIN32 - +#endif //_WIN32 void b3PrintfFuncDefault(const char* msg) { #ifdef _WIN32 OutputDebugStringA(msg); #endif - printf("%s",msg); - //is this portable? - fflush(stdout); + printf("%s", msg); + //is this portable? + fflush(stdout); } void b3WarningMessageFuncDefault(const char* msg) @@ -38,32 +37,26 @@ void b3WarningMessageFuncDefault(const char* msg) #ifdef _WIN32 OutputDebugStringA(msg); #endif - printf("%s",msg); - //is this portable? - fflush(stdout); - + printf("%s", msg); + //is this portable? + fflush(stdout); } - void b3ErrorMessageFuncDefault(const char* msg) { #ifdef _WIN32 OutputDebugStringA(msg); #endif - printf("%s",msg); + printf("%s", msg); - //is this portable? - fflush(stdout); - + //is this portable? + fflush(stdout); } - - static b3PrintfFunc* b3s_printfFunc = b3PrintfFuncDefault; static b3WarningMessageFunc* b3s_warningMessageFunc = b3WarningMessageFuncDefault; static b3ErrorMessageFunc* b3s_errorMessageFunc = b3ErrorMessageFuncDefault; - ///The developer can route b3Printf output using their own implementation void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc) { @@ -81,54 +74,50 @@ void b3SetCustomErrorMessageFunc(b3PrintfFunc* errorMessageFunc) //#define B3_MAX_DEBUG_STRING_LENGTH 2048 #define B3_MAX_DEBUG_STRING_LENGTH 32768 - -void b3OutputPrintfVarArgsInternal(const char *str, ...) +void b3OutputPrintfVarArgsInternal(const char* str, ...) { - char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0}; - va_list argList; - va_start(argList, str); + char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0}; + va_list argList; + va_start(argList, str); #ifdef _MSC_VER - vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #else - vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #endif - (b3s_printfFunc)(strDebug); - va_end(argList); + (b3s_printfFunc)(strDebug); + va_end(argList); } -void b3OutputWarningMessageVarArgsInternal(const char *str, ...) +void b3OutputWarningMessageVarArgsInternal(const char* str, ...) { - char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0}; - va_list argList; - va_start(argList, str); + char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0}; + va_list argList; + va_start(argList, str); #ifdef _MSC_VER - vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #else - vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #endif - (b3s_warningMessageFunc)(strDebug); - va_end(argList); + (b3s_warningMessageFunc)(strDebug); + va_end(argList); } -void b3OutputErrorMessageVarArgsInternal(const char *str, ...) +void b3OutputErrorMessageVarArgsInternal(const char* str, ...) { - - char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0}; - va_list argList; - va_start(argList, str); + char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0}; + va_list argList; + va_start(argList, str); #ifdef _MSC_VER - vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #else - vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList); + vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList); #endif - (b3s_errorMessageFunc)(strDebug); - va_end(argList); - + (b3s_errorMessageFunc)(strDebug); + va_end(argList); } - -void b3EnterProfileZoneDefault(const char* name) +void b3EnterProfileZoneDefault(const char* name) { } -void b3LeaveProfileZoneDefault() +void b3LeaveProfileZoneDefault() { } static b3EnterProfileZoneFunc* b3s_enterFunc = b3EnterProfileZoneDefault; @@ -151,10 +140,6 @@ void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc) b3s_leaveFunc = leaveFunc; } - - - #ifndef _MSC_VER #undef vsprintf_s #endif - diff --git a/thirdparty/bullet/Bullet3Common/b3Logging.h b/thirdparty/bullet/Bullet3Common/b3Logging.h index b302effe43..9c92b12ebb 100644 --- a/thirdparty/bullet/Bullet3Common/b3Logging.h +++ b/thirdparty/bullet/Bullet3Common/b3Logging.h @@ -3,75 +3,84 @@ #define B3_LOGGING_H #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif - + ///We add the do/while so that the statement "if (condition) b3Printf("test"); else {...}" would fail ///You can also customize the message by uncommenting out a different line below #define b3Printf(...) b3OutputPrintfVarArgsInternal(__VA_ARGS__) -//#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0) -//#define b3Printf b3OutputPrintfVarArgsInternal -//#define b3Printf(...) printf(__VA_ARGS__) -//#define b3Printf(...) - -#define b3Warning(...) do {b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n",__FILE__,__LINE__);b3OutputWarningMessageVarArgsInternal(__VA_ARGS__); }while(0) -#define b3Error(...) do {b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n",__FILE__,__LINE__);b3OutputErrorMessageVarArgsInternal(__VA_ARGS__); } while(0) - + //#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0) + //#define b3Printf b3OutputPrintfVarArgsInternal + //#define b3Printf(...) printf(__VA_ARGS__) + //#define b3Printf(...) + +#define b3Warning(...) \ + do \ + { \ + b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n", __FILE__, __LINE__); \ + b3OutputWarningMessageVarArgsInternal(__VA_ARGS__); \ + } while (0) +#define b3Error(...) \ + do \ + { \ + b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n", __FILE__, __LINE__); \ + b3OutputErrorMessageVarArgsInternal(__VA_ARGS__); \ + } while (0) #ifndef B3_NO_PROFILE -void b3EnterProfileZone(const char* name); -void b3LeaveProfileZone(); + void b3EnterProfileZone(const char* name); + void b3LeaveProfileZone(); #ifdef __cplusplus -class b3ProfileZone -{ -public: - b3ProfileZone(const char* name) - { - b3EnterProfileZone( name ); - } - - ~b3ProfileZone() - { - b3LeaveProfileZone(); - } -}; - -#define B3_PROFILE( name ) b3ProfileZone __profile( name ) + class b3ProfileZone + { + public: + b3ProfileZone(const char* name) + { + b3EnterProfileZone(name); + } + + ~b3ProfileZone() + { + b3LeaveProfileZone(); + } + }; + +#define B3_PROFILE(name) b3ProfileZone __profile(name) #endif -#else //B3_NO_PROFILE +#else //B3_NO_PROFILE -#define B3_PROFILE( name ) +#define B3_PROFILE(name) #define b3StartProfile(a) #define b3StopProfile -#endif //#ifndef B3_NO_PROFILE - +#endif //#ifndef B3_NO_PROFILE -typedef void (b3PrintfFunc)(const char* msg); -typedef void (b3WarningMessageFunc)(const char* msg); -typedef void (b3ErrorMessageFunc)(const char* msg); -typedef void (b3EnterProfileZoneFunc)(const char* msg); -typedef void (b3LeaveProfileZoneFunc)(); + typedef void(b3PrintfFunc)(const char* msg); + typedef void(b3WarningMessageFunc)(const char* msg); + typedef void(b3ErrorMessageFunc)(const char* msg); + typedef void(b3EnterProfileZoneFunc)(const char* msg); + typedef void(b3LeaveProfileZoneFunc)(); -///The developer can route b3Printf output using their own implementation -void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc); -void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc); -void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc); + ///The developer can route b3Printf output using their own implementation + void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc); + void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc); + void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc); -///Set custom profile zone functions (zones can be nested) -void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc); -void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc); + ///Set custom profile zone functions (zones can be nested) + void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc); + void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc); -///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version) -void b3OutputPrintfVarArgsInternal(const char *str, ...); -void b3OutputWarningMessageVarArgsInternal(const char *str, ...); -void b3OutputErrorMessageVarArgsInternal(const char *str, ...); + ///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version) + void b3OutputPrintfVarArgsInternal(const char* str, ...); + void b3OutputWarningMessageVarArgsInternal(const char* str, ...); + void b3OutputErrorMessageVarArgsInternal(const char* str, ...); #ifdef __cplusplus - } +} #endif -#endif//B3_LOGGING_H
\ No newline at end of file +#endif //B3_LOGGING_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h b/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h index 89b57cf59a..6c46536a81 100644 --- a/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h +++ b/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h @@ -12,8 +12,7 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - -#ifndef B3_MATRIX3x3_H +#ifndef B3_MATRIX3x3_H #define B3_MATRIX3x3_H #include "b3Vector3.h" @@ -32,22 +31,22 @@ const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0010) = {0.0f, 0.0f, 1.0f, 0.0f}; #endif #ifdef B3_USE_DOUBLE_PRECISION -#define b3Matrix3x3Data b3Matrix3x3DoubleData +#define b3Matrix3x3Data b3Matrix3x3DoubleData #else -#define b3Matrix3x3Data b3Matrix3x3FloatData -#endif //B3_USE_DOUBLE_PRECISION - +#define b3Matrix3x3Data b3Matrix3x3FloatData +#endif //B3_USE_DOUBLE_PRECISION /**@brief The b3Matrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with b3Quaternion, b3Transform and b3Vector3. * Make sure to only include a pure orthogonal matrix without scaling. */ -B3_ATTRIBUTE_ALIGNED16(class) b3Matrix3x3 { - +B3_ATTRIBUTE_ALIGNED16(class) +b3Matrix3x3 +{ ///Data storage for the matrix, each vector is a row of the matrix b3Vector3 m_el[3]; public: /** @brief No initializaion constructor */ - b3Matrix3x3 () {} + b3Matrix3x3() {} // explicit b3Matrix3x3(const b3Scalar *m) { setFromOpenGLSubMatrix(m); } @@ -62,27 +61,27 @@ public: */ /** @brief Constructor with row major formatting */ b3Matrix3x3(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz, - const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, - const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz) - { - setValue(xx, xy, xz, - yx, yy, yz, - zx, zy, zz); + const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, + const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz) + { + setValue(xx, xy, xz, + yx, yy, yz, + zx, zy, zz); } -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) - B3_FORCE_INLINE b3Matrix3x3 (const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2 ) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + B3_FORCE_INLINE b3Matrix3x3(const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2) { - m_el[0].mVec128 = v0; - m_el[1].mVec128 = v1; - m_el[2].mVec128 = v2; + m_el[0].mVec128 = v0; + m_el[1].mVec128 = v1; + m_el[2].mVec128 = v2; } - B3_FORCE_INLINE b3Matrix3x3 (const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2 ) + B3_FORCE_INLINE b3Matrix3x3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) { - m_el[0] = v0; - m_el[1] = v1; - m_el[2] = v2; + m_el[0] = v0; + m_el[1] = v1; + m_el[2] = v2; } // Copy constructor @@ -94,25 +93,25 @@ public: } // Assignment Operator - B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m) + B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m) { m_el[0].mVec128 = m.m_el[0].mVec128; m_el[1].mVec128 = m.m_el[1].mVec128; m_el[2].mVec128 = m.m_el[2].mVec128; - + return *this; } #else /** @brief Copy constructor */ - B3_FORCE_INLINE b3Matrix3x3 (const b3Matrix3x3& other) + B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& other) { m_el[0] = other.m_el[0]; m_el[1] = other.m_el[1]; m_el[2] = other.m_el[2]; } - + /** @brief Assignment Operator */ B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& other) { @@ -128,10 +127,9 @@ public: * @param i Column number 0 indexed */ B3_FORCE_INLINE b3Vector3 getColumn(int i) const { - return b3MakeVector3(m_el[0][i],m_el[1][i],m_el[2][i]); + return b3MakeVector3(m_el[0][i], m_el[1][i], m_el[2][i]); } - /** @brief Get a row of the matrix as a vector * @param i Row number 0 indexed */ B3_FORCE_INLINE const b3Vector3& getRow(int i) const @@ -142,10 +140,10 @@ public: /** @brief Get a mutable reference to a row of the matrix as a vector * @param i Row number 0 indexed */ - B3_FORCE_INLINE b3Vector3& operator[](int i) - { + B3_FORCE_INLINE b3Vector3& operator[](int i) + { b3FullAssert(0 <= i && i < 3); - return m_el[i]; + return m_el[i]; } /** @brief Get a const reference to a row of the matrix as a vector @@ -153,32 +151,31 @@ public: B3_FORCE_INLINE const b3Vector3& operator[](int i) const { b3FullAssert(0 <= i && i < 3); - return m_el[i]; + return m_el[i]; } /** @brief Multiply by the target matrix on the right * @param m Rotation matrix to be applied * Equivilant to this = this * m */ - b3Matrix3x3& operator*=(const b3Matrix3x3& m); + b3Matrix3x3& operator*=(const b3Matrix3x3& m); /** @brief Adds by the target matrix on the right * @param m matrix to be applied * Equivilant to this = this + m */ - b3Matrix3x3& operator+=(const b3Matrix3x3& m); + b3Matrix3x3& operator+=(const b3Matrix3x3& m); /** @brief Substractss by the target matrix on the right * @param m matrix to be applied * Equivilant to this = this - m */ - b3Matrix3x3& operator-=(const b3Matrix3x3& m); + b3Matrix3x3& operator-=(const b3Matrix3x3& m); /** @brief Set from the rotational part of a 4x4 OpenGL matrix * @param m A pointer to the beginning of the array of scalars*/ - void setFromOpenGLSubMatrix(const b3Scalar *m) + void setFromOpenGLSubMatrix(const b3Scalar* m) { - m_el[0].setValue(m[0],m[4],m[8]); - m_el[1].setValue(m[1],m[5],m[9]); - m_el[2].setValue(m[2],m[6],m[10]); - + m_el[0].setValue(m[0], m[4], m[8]); + m_el[1].setValue(m[1], m[5], m[9]); + m_el[2].setValue(m[2], m[6], m[10]); } /** @brief Set the values of the matrix explicitly (row major) * @param xx Top left @@ -190,93 +187,92 @@ public: * @param zx Bottom Left * @param zy Bottom Middle * @param zz Bottom Right*/ - void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz, - const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, - const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz) + void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz, + const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, + const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz) { - m_el[0].setValue(xx,xy,xz); - m_el[1].setValue(yx,yy,yz); - m_el[2].setValue(zx,zy,zz); + m_el[0].setValue(xx, xy, xz); + m_el[1].setValue(yx, yy, yz); + m_el[2].setValue(zx, zy, zz); } /** @brief Set the matrix from a quaternion - * @param q The Quaternion to match */ - void setRotation(const b3Quaternion& q) + * @param q The Quaternion to match */ + void setRotation(const b3Quaternion& q) { b3Scalar d = q.length2(); b3FullAssert(d != b3Scalar(0.0)); b3Scalar s = b3Scalar(2.0) / d; - - #if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vs, Q = q.get128(); + +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vs, Q = q.get128(); __m128i Qi = b3CastfTo128i(Q); - __m128 Y, Z; - __m128 V1, V2, V3; - __m128 V11, V21, V31; - __m128 NQ = _mm_xor_ps(Q, b3vMzeroMask); + __m128 Y, Z; + __m128 V1, V2, V3; + __m128 V11, V21, V31; + __m128 NQ = _mm_xor_ps(Q, b3vMzeroMask); __m128i NQi = b3CastfTo128i(NQ); - - V1 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,2,3))); // Y X Z W - V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0,0,1,3)); // -X -X Y W - V3 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(2,1,0,3))); // Z Y X W - V1 = _mm_xor_ps(V1, b3vMPPP); // change the sign of the first element - - V11 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,1,0,3))); // Y Y X W - V21 = _mm_unpackhi_ps(Q, Q); // Z Z W W - V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0,2,0,3)); // X Z -X -W - - V2 = V2 * V1; // - V1 = V1 * V11; // - V3 = V3 * V31; // - - V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2,3,1,3)); // -Z -W Y W - V11 = V11 * V21; // - V21 = _mm_xor_ps(V21, b3vMPPP); // change the sign of the first element - V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3,3,1,3)); // W W -Y -W - V31 = _mm_xor_ps(V31, b3vMPPP); // change the sign of the first element - Y = b3CastiTo128f(_mm_shuffle_epi32 (NQi, B3_SHUFFLE(3,2,0,3))); // -W -Z -X -W - Z = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,1,3))); // Y X Y W + + V1 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 2, 3))); // Y X Z W + V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0, 0, 1, 3)); // -X -X Y W + V3 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(2, 1, 0, 3))); // Z Y X W + V1 = _mm_xor_ps(V1, b3vMPPP); // change the sign of the first element + + V11 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 1, 0, 3))); // Y Y X W + V21 = _mm_unpackhi_ps(Q, Q); // Z Z W W + V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0, 2, 0, 3)); // X Z -X -W + + V2 = V2 * V1; // + V1 = V1 * V11; // + V3 = V3 * V31; // + + V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2, 3, 1, 3)); // -Z -W Y W + V11 = V11 * V21; // + V21 = _mm_xor_ps(V21, b3vMPPP); // change the sign of the first element + V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3, 3, 1, 3)); // W W -Y -W + V31 = _mm_xor_ps(V31, b3vMPPP); // change the sign of the first element + Y = b3CastiTo128f(_mm_shuffle_epi32(NQi, B3_SHUFFLE(3, 2, 0, 3))); // -W -Z -X -W + Z = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 1, 3))); // Y X Y W vs = _mm_load_ss(&s); V21 = V21 * Y; V31 = V31 * Z; V1 = V1 + V11; - V2 = V2 + V21; - V3 = V3 + V31; - - vs = b3_splat3_ps(vs, 0); - // s ready - V1 = V1 * vs; - V2 = V2 * vs; - V3 = V3 * vs; - - V1 = V1 + b3v1000; - V2 = V2 + b3v0100; - V3 = V3 + b3v0010; - - m_el[0] = b3MakeVector3(V1); - m_el[1] = b3MakeVector3(V2); - m_el[2] = b3MakeVector3(V3); - #else - b3Scalar xs = q.getX() * s, ys = q.getY() * s, zs = q.getZ() * s; - b3Scalar wx = q.getW() * xs, wy = q.getW() * ys, wz = q.getW() * zs; - b3Scalar xx = q.getX() * xs, xy = q.getX() * ys, xz = q.getX() * zs; - b3Scalar yy = q.getY() * ys, yz = q.getY() * zs, zz = q.getZ() * zs; + V2 = V2 + V21; + V3 = V3 + V31; + + vs = b3_splat3_ps(vs, 0); + // s ready + V1 = V1 * vs; + V2 = V2 * vs; + V3 = V3 * vs; + + V1 = V1 + b3v1000; + V2 = V2 + b3v0100; + V3 = V3 + b3v0010; + + m_el[0] = b3MakeVector3(V1); + m_el[1] = b3MakeVector3(V2); + m_el[2] = b3MakeVector3(V3); +#else + b3Scalar xs = q.getX() * s, ys = q.getY() * s, zs = q.getZ() * s; + b3Scalar wx = q.getW() * xs, wy = q.getW() * ys, wz = q.getW() * zs; + b3Scalar xx = q.getX() * xs, xy = q.getX() * ys, xz = q.getX() * zs; + b3Scalar yy = q.getY() * ys, yz = q.getY() * zs, zz = q.getZ() * zs; setValue( - b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy, + b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy, xy + wz, b3Scalar(1.0) - (xx + zz), yz - wx, xz - wy, yz + wx, b3Scalar(1.0) - (xx + yy)); - #endif - } - +#endif + } /** @brief Set the matrix from euler angles using YPR around YXZ respectively * @param yaw Yaw about Y axis * @param pitch Pitch about X axis * @param roll Roll about Z axis */ - void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) + void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) { setEulerZYX(roll, pitch, yaw); } @@ -290,182 +286,197 @@ public: * angles are applied in ZYX order. I.e a vector is first rotated * about X then Y and then Z **/ - void setEulerZYX(b3Scalar eulerX,b3Scalar eulerY,b3Scalar eulerZ) { + void setEulerZYX(b3Scalar eulerX, b3Scalar eulerY, b3Scalar eulerZ) + { ///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code - b3Scalar ci ( b3Cos(eulerX)); - b3Scalar cj ( b3Cos(eulerY)); - b3Scalar ch ( b3Cos(eulerZ)); - b3Scalar si ( b3Sin(eulerX)); - b3Scalar sj ( b3Sin(eulerY)); - b3Scalar sh ( b3Sin(eulerZ)); - b3Scalar cc = ci * ch; - b3Scalar cs = ci * sh; - b3Scalar sc = si * ch; + b3Scalar ci(b3Cos(eulerX)); + b3Scalar cj(b3Cos(eulerY)); + b3Scalar ch(b3Cos(eulerZ)); + b3Scalar si(b3Sin(eulerX)); + b3Scalar sj(b3Sin(eulerY)); + b3Scalar sh(b3Sin(eulerZ)); + b3Scalar cc = ci * ch; + b3Scalar cs = ci * sh; + b3Scalar sc = si * ch; b3Scalar ss = si * sh; setValue(cj * ch, sj * sc - cs, sj * cc + ss, - cj * sh, sj * ss + cc, sj * cs - sc, - -sj, cj * si, cj * ci); + cj * sh, sj * ss + cc, sj * cs - sc, + -sj, cj * si, cj * ci); } /**@brief Set the matrix to the identity */ void setIdentity() - { -#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON) - m_el[0] = b3MakeVector3(b3v1000); - m_el[1] = b3MakeVector3(b3v0100); - m_el[2] = b3MakeVector3(b3v0010); + { +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + m_el[0] = b3MakeVector3(b3v1000); + m_el[1] = b3MakeVector3(b3v0100); + m_el[2] = b3MakeVector3(b3v0010); #else - setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), - b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), - b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); + setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), + b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), + b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); #endif } - static const b3Matrix3x3& getIdentity() + static const b3Matrix3x3& getIdentity() { -#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON) - static const b3Matrix3x3 - identityMatrix(b3v1000, b3v0100, b3v0010); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + static const b3Matrix3x3 + identityMatrix(b3v1000, b3v0100, b3v0010); #else - static const b3Matrix3x3 - identityMatrix( - b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), - b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), - b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); + static const b3Matrix3x3 + identityMatrix( + b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), + b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), + b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); #endif return identityMatrix; } /**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective * @param m The array to be filled */ - void getOpenGLSubMatrix(b3Scalar *m) const + void getOpenGLSubMatrix(b3Scalar * m) const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 v0 = m_el[0].mVec128; - __m128 v1 = m_el[1].mVec128; - __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 - __m128 *vm = (__m128 *)m; - __m128 vT; - - v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0 - - vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * - v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 - - v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) ); // y0 y1 y2 0 - v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) ); // x0 x1 x2 0 - v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0 - - vm[0] = v0; - vm[1] = v1; - vm[2] = v2; +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 v0 = m_el[0].mVec128; + __m128 v1 = m_el[1].mVec128; + __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 + __m128* vm = (__m128*)m; + __m128 vT; + + v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0 + + vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * + v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 + + v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0 + v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0 + v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0 + + vm[0] = v0; + vm[1] = v1; + vm[2] = v2; #elif defined(B3_USE_NEON) - // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. - static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 }; - float32x4_t *vm = (float32x4_t *)m; - float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1} - float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0} - float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] ); - float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] ); - float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask ); - float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q ); // z0 z1 z2 0 - - vm[0] = v0; - vm[1] = v1; - vm[2] = v2; + // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. + static const uint32x2_t zMask = (const uint32x2_t){-1, 0}; + float32x4_t* vm = (float32x4_t*)m; + float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1} + float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0} + float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]); + float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]); + float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask); + float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0 + + vm[0] = v0; + vm[1] = v1; + vm[2] = v2; #else - m[0] = b3Scalar(m_el[0].getX()); - m[1] = b3Scalar(m_el[1].getX()); - m[2] = b3Scalar(m_el[2].getX()); - m[3] = b3Scalar(0.0); - m[4] = b3Scalar(m_el[0].getY()); - m[5] = b3Scalar(m_el[1].getY()); - m[6] = b3Scalar(m_el[2].getY()); - m[7] = b3Scalar(0.0); - m[8] = b3Scalar(m_el[0].getZ()); - m[9] = b3Scalar(m_el[1].getZ()); + m[0] = b3Scalar(m_el[0].getX()); + m[1] = b3Scalar(m_el[1].getX()); + m[2] = b3Scalar(m_el[2].getX()); + m[3] = b3Scalar(0.0); + m[4] = b3Scalar(m_el[0].getY()); + m[5] = b3Scalar(m_el[1].getY()); + m[6] = b3Scalar(m_el[2].getY()); + m[7] = b3Scalar(0.0); + m[8] = b3Scalar(m_el[0].getZ()); + m[9] = b3Scalar(m_el[1].getZ()); m[10] = b3Scalar(m_el[2].getZ()); - m[11] = b3Scalar(0.0); + m[11] = b3Scalar(0.0); #endif } /**@brief Get the matrix represented as a quaternion * @param q The quaternion which will be set */ - void getRotation(b3Quaternion& q) const + void getRotation(b3Quaternion & q) const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) - b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ(); - b3Scalar s, x; - - union { - b3SimdFloat4 vec; - b3Scalar f[4]; - } temp; - - if (trace > b3Scalar(0.0)) - { - x = trace + b3Scalar(1.0); - - temp.f[0]=m_el[2].getY() - m_el[1].getZ(); - temp.f[1]=m_el[0].getZ() - m_el[2].getX(); - temp.f[2]=m_el[1].getX() - m_el[0].getY(); - temp.f[3]=x; - //temp.f[3]= s * b3Scalar(0.5); - } - else - { - int i, j, k; - if(m_el[0].getX() < m_el[1].getY()) - { - if( m_el[1].getY() < m_el[2].getZ() ) - { i = 2; j = 0; k = 1; } - else - { i = 1; j = 2; k = 0; } - } - else - { - if( m_el[0].getX() < m_el[2].getZ()) - { i = 2; j = 0; k = 1; } - else - { i = 0; j = 1; k = 2; } - } - - x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0); - - temp.f[3] = (m_el[k][j] - m_el[j][k]); - temp.f[j] = (m_el[j][i] + m_el[i][j]); - temp.f[k] = (m_el[k][i] + m_el[i][k]); - temp.f[i] = x; - //temp.f[i] = s * b3Scalar(0.5); - } - - s = b3Sqrt(x); - q.set128(temp.vec); - s = b3Scalar(0.5) / s; - - q *= s; -#else +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ(); + b3Scalar s, x; + + union { + b3SimdFloat4 vec; + b3Scalar f[4]; + } temp; + + if (trace > b3Scalar(0.0)) + { + x = trace + b3Scalar(1.0); + + temp.f[0] = m_el[2].getY() - m_el[1].getZ(); + temp.f[1] = m_el[0].getZ() - m_el[2].getX(); + temp.f[2] = m_el[1].getX() - m_el[0].getY(); + temp.f[3] = x; + //temp.f[3]= s * b3Scalar(0.5); + } + else + { + int i, j, k; + if (m_el[0].getX() < m_el[1].getY()) + { + if (m_el[1].getY() < m_el[2].getZ()) + { + i = 2; + j = 0; + k = 1; + } + else + { + i = 1; + j = 2; + k = 0; + } + } + else + { + if (m_el[0].getX() < m_el[2].getZ()) + { + i = 2; + j = 0; + k = 1; + } + else + { + i = 0; + j = 1; + k = 2; + } + } + + x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0); + + temp.f[3] = (m_el[k][j] - m_el[j][k]); + temp.f[j] = (m_el[j][i] + m_el[i][j]); + temp.f[k] = (m_el[k][i] + m_el[i][k]); + temp.f[i] = x; + //temp.f[i] = s * b3Scalar(0.5); + } + + s = b3Sqrt(x); + q.set128(temp.vec); + s = b3Scalar(0.5) / s; + + q *= s; +#else b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ(); b3Scalar temp[4]; - if (trace > b3Scalar(0.0)) + if (trace > b3Scalar(0.0)) { b3Scalar s = b3Sqrt(trace + b3Scalar(1.0)); - temp[3]=(s * b3Scalar(0.5)); + temp[3] = (s * b3Scalar(0.5)); s = b3Scalar(0.5) / s; - temp[0]=((m_el[2].getY() - m_el[1].getZ()) * s); - temp[1]=((m_el[0].getZ() - m_el[2].getX()) * s); - temp[2]=((m_el[1].getX() - m_el[0].getY()) * s); - } - else + temp[0] = ((m_el[2].getY() - m_el[1].getZ()) * s); + temp[1] = ((m_el[0].getZ() - m_el[2].getX()) * s); + temp[2] = ((m_el[1].getX() - m_el[0].getY()) * s); + } + else { - int i = m_el[0].getX() < m_el[1].getY() ? - (m_el[1].getY() < m_el[2].getZ() ? 2 : 1) : - (m_el[0].getX() < m_el[2].getZ() ? 2 : 0); - int j = (i + 1) % 3; + int i = m_el[0].getX() < m_el[1].getY() ? (m_el[1].getY() < m_el[2].getZ() ? 2 : 1) : (m_el[0].getX() < m_el[2].getZ() ? 2 : 0); + int j = (i + 1) % 3; int k = (i + 2) % 3; b3Scalar s = b3Sqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0)); @@ -476,44 +487,42 @@ public: temp[j] = (m_el[j][i] + m_el[i][j]) * s; temp[k] = (m_el[k][i] + m_el[i][k]) * s; } - q.setValue(temp[0],temp[1],temp[2],temp[3]); + q.setValue(temp[0], temp[1], temp[2], temp[3]); #endif } /**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR * @param yaw Yaw around Y axis * @param pitch Pitch around X axis - * @param roll around Z axis */ - void getEulerYPR(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll) const + * @param roll around Z axis */ + void getEulerYPR(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll) const { - // first use the normal calculus yaw = b3Scalar(b3Atan2(m_el[1].getX(), m_el[0].getX())); pitch = b3Scalar(b3Asin(-m_el[2].getX())); roll = b3Scalar(b3Atan2(m_el[2].getY(), m_el[2].getZ())); // on pitch = +/-HalfPI - if (b3Fabs(pitch)==B3_HALF_PI) + if (b3Fabs(pitch) == B3_HALF_PI) { - if (yaw>0) - yaw-=B3_PI; + if (yaw > 0) + yaw -= B3_PI; else - yaw+=B3_PI; + yaw += B3_PI; - if (roll>0) - roll-=B3_PI; + if (roll > 0) + roll -= B3_PI; else - roll+=B3_PI; + roll += B3_PI; } }; - /**@brief Get the matrix represented as euler angles around ZYX * @param yaw Yaw around X axis * @param pitch Pitch around Y axis * @param roll around X axis - * @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/ - void getEulerZYX(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll, unsigned int solution_number = 1) const + * @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/ + void getEulerZYX(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll, unsigned int solution_number = 1) const { struct Euler { @@ -523,7 +532,7 @@ public: }; Euler euler_out; - Euler euler_out2; //second solution + Euler euler_out2; //second solution //get the pointer to the raw data // Check that pitch is not at a singularity @@ -533,7 +542,7 @@ public: euler_out2.yaw = 0; // From difference of angles formula - b3Scalar delta = b3Atan2(m_el[0].getX(),m_el[0].getZ()); + b3Scalar delta = b3Atan2(m_el[0].getX(), m_el[0].getZ()); if (m_el[2].getX() > 0) //gimbal locked up { euler_out.pitch = B3_PI / b3Scalar(2.0); @@ -541,7 +550,7 @@ public: euler_out.roll = euler_out.pitch + delta; euler_out2.roll = euler_out.pitch + delta; } - else // gimbal locked down + else // gimbal locked down { euler_out.pitch = -B3_PI / b3Scalar(2.0); euler_out2.pitch = -B3_PI / b3Scalar(2.0); @@ -551,29 +560,29 @@ public: } else { - euler_out.pitch = - b3Asin(m_el[2].getX()); + euler_out.pitch = -b3Asin(m_el[2].getX()); euler_out2.pitch = B3_PI - euler_out.pitch; - euler_out.roll = b3Atan2(m_el[2].getY()/b3Cos(euler_out.pitch), - m_el[2].getZ()/b3Cos(euler_out.pitch)); - euler_out2.roll = b3Atan2(m_el[2].getY()/b3Cos(euler_out2.pitch), - m_el[2].getZ()/b3Cos(euler_out2.pitch)); + euler_out.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out.pitch), + m_el[2].getZ() / b3Cos(euler_out.pitch)); + euler_out2.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out2.pitch), + m_el[2].getZ() / b3Cos(euler_out2.pitch)); - euler_out.yaw = b3Atan2(m_el[1].getX()/b3Cos(euler_out.pitch), - m_el[0].getX()/b3Cos(euler_out.pitch)); - euler_out2.yaw = b3Atan2(m_el[1].getX()/b3Cos(euler_out2.pitch), - m_el[0].getX()/b3Cos(euler_out2.pitch)); + euler_out.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out.pitch), + m_el[0].getX() / b3Cos(euler_out.pitch)); + euler_out2.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out2.pitch), + m_el[0].getX() / b3Cos(euler_out2.pitch)); } if (solution_number == 1) - { - yaw = euler_out.yaw; + { + yaw = euler_out.yaw; pitch = euler_out.pitch; roll = euler_out.roll; } else - { - yaw = euler_out2.yaw; + { + yaw = euler_out2.yaw; pitch = euler_out2.pitch; roll = euler_out2.roll; } @@ -584,18 +593,18 @@ public: b3Matrix3x3 scaled(const b3Vector3& s) const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) return b3Matrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s); -#else +#else return b3Matrix3x3( - m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(), + m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(), m_el[1].getX() * s.getX(), m_el[1].getY() * s.getY(), m_el[1].getZ() * s.getZ(), m_el[2].getX() * s.getX(), m_el[2].getY() * s.getY(), m_el[2].getZ() * s.getZ()); #endif } /**@brief Return the determinant of the matrix */ - b3Scalar determinant() const; + b3Scalar determinant() const; /**@brief Return the adjoint of the matrix */ b3Matrix3x3 adjoint() const; /**@brief Return the matrix with all values non negative */ @@ -603,25 +612,24 @@ public: /**@brief Return the transpose of the matrix */ b3Matrix3x3 transpose() const; /**@brief Return the inverse of the matrix */ - b3Matrix3x3 inverse() const; + b3Matrix3x3 inverse() const; b3Matrix3x3 transposeTimes(const b3Matrix3x3& m) const; b3Matrix3x3 timesTranspose(const b3Matrix3x3& m) const; - B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const + B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const { return m_el[0].getX() * v.getX() + m_el[1].getX() * v.getY() + m_el[2].getX() * v.getZ(); } - B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const + B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const { return m_el[0].getY() * v.getX() + m_el[1].getY() * v.getY() + m_el[2].getY() * v.getZ(); } - B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const + B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const { return m_el[0].getZ() * v.getX() + m_el[1].getZ() * v.getY() + m_el[2].getZ() * v.getZ(); } - /**@brief diagonalizes this matrix by the Jacobi method. * @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original * coordinate system, i.e., old_this = rot * new_this * rot^T. @@ -631,7 +639,7 @@ public: * * Note that this matrix is assumed to be symmetric. */ - void diagonalize(b3Matrix3x3& rot, b3Scalar threshold, int maxSteps) + void diagonalize(b3Matrix3x3 & rot, b3Scalar threshold, int maxSteps) { rot.setIdentity(); for (int step = maxSteps; step > 0; step--) @@ -667,7 +675,7 @@ public: step = 1; } - // compute Jacobi rotation J which leads to a zero for element [p][q] + // compute Jacobi rotation J which leads to a zero for element [p][q] b3Scalar mpq = m_el[p][q]; b3Scalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq); b3Scalar theta2 = theta * theta; @@ -676,7 +684,7 @@ public: if (theta2 * theta2 < b3Scalar(10 / B3_EPSILON)) { t = (theta >= 0) ? 1 / (theta + b3Sqrt(1 + theta2)) - : 1 / (theta - b3Sqrt(1 + theta2)); + : 1 / (theta - b3Sqrt(1 + theta2)); cos = 1 / b3Sqrt(1 + t * t); sin = cos * t; } @@ -709,9 +717,6 @@ public: } } - - - /**@brief Calculate the matrix cofactor * @param r1 The first row to use for calculating the cofactor * @param c1 The first column to use for calculating the cofactor @@ -719,304 +724,298 @@ public: * @param c1 The second column to use for calculating the cofactor * See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details */ - b3Scalar cofac(int r1, int c1, int r2, int c2) const + b3Scalar cofac(int r1, int c1, int r2, int c2) const { return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1]; } - void serialize(struct b3Matrix3x3Data& dataOut) const; + void serialize(struct b3Matrix3x3Data & dataOut) const; - void serializeFloat(struct b3Matrix3x3FloatData& dataOut) const; + void serializeFloat(struct b3Matrix3x3FloatData & dataOut) const; - void deSerialize(const struct b3Matrix3x3Data& dataIn); + void deSerialize(const struct b3Matrix3x3Data& dataIn); - void deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn); - - void deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn); + void deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn); + void deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn); }; - -B3_FORCE_INLINE b3Matrix3x3& +B3_FORCE_INLINE b3Matrix3x3& b3Matrix3x3::operator*=(const b3Matrix3x3& m) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 rv00, rv01, rv02; - __m128 rv10, rv11, rv12; - __m128 rv20, rv21, rv22; - __m128 mv0, mv1, mv2; - - rv02 = m_el[0].mVec128; - rv12 = m_el[1].mVec128; - rv22 = m_el[2].mVec128; - - mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask); - mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask); - mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask); - - // rv0 - rv00 = b3_splat_ps(rv02, 0); - rv01 = b3_splat_ps(rv02, 1); - rv02 = b3_splat_ps(rv02, 2); - - rv00 = _mm_mul_ps(rv00, mv0); - rv01 = _mm_mul_ps(rv01, mv1); - rv02 = _mm_mul_ps(rv02, mv2); - - // rv1 - rv10 = b3_splat_ps(rv12, 0); - rv11 = b3_splat_ps(rv12, 1); - rv12 = b3_splat_ps(rv12, 2); - - rv10 = _mm_mul_ps(rv10, mv0); - rv11 = _mm_mul_ps(rv11, mv1); - rv12 = _mm_mul_ps(rv12, mv2); - - // rv2 - rv20 = b3_splat_ps(rv22, 0); - rv21 = b3_splat_ps(rv22, 1); - rv22 = b3_splat_ps(rv22, 2); - - rv20 = _mm_mul_ps(rv20, mv0); - rv21 = _mm_mul_ps(rv21, mv1); - rv22 = _mm_mul_ps(rv22, mv2); - - rv00 = _mm_add_ps(rv00, rv01); - rv10 = _mm_add_ps(rv10, rv11); - rv20 = _mm_add_ps(rv20, rv21); - - m_el[0].mVec128 = _mm_add_ps(rv00, rv02); - m_el[1].mVec128 = _mm_add_ps(rv10, rv12); - m_el[2].mVec128 = _mm_add_ps(rv20, rv22); +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 rv00, rv01, rv02; + __m128 rv10, rv11, rv12; + __m128 rv20, rv21, rv22; + __m128 mv0, mv1, mv2; + + rv02 = m_el[0].mVec128; + rv12 = m_el[1].mVec128; + rv22 = m_el[2].mVec128; + + mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask); + mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask); + mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask); + + // rv0 + rv00 = b3_splat_ps(rv02, 0); + rv01 = b3_splat_ps(rv02, 1); + rv02 = b3_splat_ps(rv02, 2); + + rv00 = _mm_mul_ps(rv00, mv0); + rv01 = _mm_mul_ps(rv01, mv1); + rv02 = _mm_mul_ps(rv02, mv2); + + // rv1 + rv10 = b3_splat_ps(rv12, 0); + rv11 = b3_splat_ps(rv12, 1); + rv12 = b3_splat_ps(rv12, 2); + + rv10 = _mm_mul_ps(rv10, mv0); + rv11 = _mm_mul_ps(rv11, mv1); + rv12 = _mm_mul_ps(rv12, mv2); + + // rv2 + rv20 = b3_splat_ps(rv22, 0); + rv21 = b3_splat_ps(rv22, 1); + rv22 = b3_splat_ps(rv22, 2); + + rv20 = _mm_mul_ps(rv20, mv0); + rv21 = _mm_mul_ps(rv21, mv1); + rv22 = _mm_mul_ps(rv22, mv2); + + rv00 = _mm_add_ps(rv00, rv01); + rv10 = _mm_add_ps(rv10, rv11); + rv20 = _mm_add_ps(rv20, rv21); + + m_el[0].mVec128 = _mm_add_ps(rv00, rv02); + m_el[1].mVec128 = _mm_add_ps(rv10, rv12); + m_el[2].mVec128 = _mm_add_ps(rv20, rv22); #elif defined(B3_USE_NEON) - float32x4_t rv0, rv1, rv2; - float32x4_t v0, v1, v2; - float32x4_t mv0, mv1, mv2; - - v0 = m_el[0].mVec128; - v1 = m_el[1].mVec128; - v2 = m_el[2].mVec128; - - mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); - mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); - mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); - - rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); - rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); - rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); - - rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); - rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); - rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); - - rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); - rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); - rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); - - m_el[0].mVec128 = rv0; - m_el[1].mVec128 = rv1; - m_el[2].mVec128 = rv2; -#else + float32x4_t rv0, rv1, rv2; + float32x4_t v0, v1, v2; + float32x4_t mv0, mv1, mv2; + + v0 = m_el[0].mVec128; + v1 = m_el[1].mVec128; + v2 = m_el[2].mVec128; + + mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); + mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); + mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); + + rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); + rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); + rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); + + rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); + rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); + rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); + + rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); + rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); + rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); + + m_el[0].mVec128 = rv0; + m_el[1].mVec128 = rv1; + m_el[2].mVec128 = rv2; +#else setValue( - m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]), + m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]), m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]), m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2])); #endif return *this; } -B3_FORCE_INLINE b3Matrix3x3& +B3_FORCE_INLINE b3Matrix3x3& b3Matrix3x3::operator+=(const b3Matrix3x3& m) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) - m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128; - m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128; - m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128; +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128; + m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128; + m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128; #else setValue( - m_el[0][0]+m.m_el[0][0], - m_el[0][1]+m.m_el[0][1], - m_el[0][2]+m.m_el[0][2], - m_el[1][0]+m.m_el[1][0], - m_el[1][1]+m.m_el[1][1], - m_el[1][2]+m.m_el[1][2], - m_el[2][0]+m.m_el[2][0], - m_el[2][1]+m.m_el[2][1], - m_el[2][2]+m.m_el[2][2]); + m_el[0][0] + m.m_el[0][0], + m_el[0][1] + m.m_el[0][1], + m_el[0][2] + m.m_el[0][2], + m_el[1][0] + m.m_el[1][0], + m_el[1][1] + m.m_el[1][1], + m_el[1][2] + m.m_el[1][2], + m_el[2][0] + m.m_el[2][0], + m_el[2][1] + m.m_el[2][1], + m_el[2][2] + m.m_el[2][2]); #endif return *this; } B3_FORCE_INLINE b3Matrix3x3 -operator*(const b3Matrix3x3& m, const b3Scalar & k) +operator*(const b3Matrix3x3& m, const b3Scalar& k) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - __m128 vk = b3_splat_ps(_mm_load_ss((float *)&k), 0x80); - return b3Matrix3x3( - _mm_mul_ps(m[0].mVec128, vk), - _mm_mul_ps(m[1].mVec128, vk), - _mm_mul_ps(m[2].mVec128, vk)); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + __m128 vk = b3_splat_ps(_mm_load_ss((float*)&k), 0x80); + return b3Matrix3x3( + _mm_mul_ps(m[0].mVec128, vk), + _mm_mul_ps(m[1].mVec128, vk), + _mm_mul_ps(m[2].mVec128, vk)); #elif defined(B3_USE_NEON) - return b3Matrix3x3( - vmulq_n_f32(m[0].mVec128, k), - vmulq_n_f32(m[1].mVec128, k), - vmulq_n_f32(m[2].mVec128, k)); + return b3Matrix3x3( + vmulq_n_f32(m[0].mVec128, k), + vmulq_n_f32(m[1].mVec128, k), + vmulq_n_f32(m[2].mVec128, k)); #else return b3Matrix3x3( - m[0].getX()*k,m[0].getY()*k,m[0].getZ()*k, - m[1].getX()*k,m[1].getY()*k,m[1].getZ()*k, - m[2].getX()*k,m[2].getY()*k,m[2].getZ()*k); + m[0].getX() * k, m[0].getY() * k, m[0].getZ() * k, + m[1].getX() * k, m[1].getY() * k, m[1].getZ() * k, + m[2].getX() * k, m[2].getY() * k, m[2].getZ() * k); #endif } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 operator+(const b3Matrix3x3& m1, const b3Matrix3x3& m2) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) return b3Matrix3x3( - m1[0].mVec128 + m2[0].mVec128, - m1[1].mVec128 + m2[1].mVec128, - m1[2].mVec128 + m2[2].mVec128); + m1[0].mVec128 + m2[0].mVec128, + m1[1].mVec128 + m2[1].mVec128, + m1[2].mVec128 + m2[2].mVec128); #else return b3Matrix3x3( - m1[0][0]+m2[0][0], - m1[0][1]+m2[0][1], - m1[0][2]+m2[0][2], - - m1[1][0]+m2[1][0], - m1[1][1]+m2[1][1], - m1[1][2]+m2[1][2], - - m1[2][0]+m2[2][0], - m1[2][1]+m2[2][1], - m1[2][2]+m2[2][2]); -#endif + m1[0][0] + m2[0][0], + m1[0][1] + m2[0][1], + m1[0][2] + m2[0][2], + + m1[1][0] + m2[1][0], + m1[1][1] + m2[1][1], + m1[1][2] + m2[1][2], + + m1[2][0] + m2[2][0], + m1[2][1] + m2[2][1], + m1[2][2] + m2[2][2]); +#endif } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 operator-(const b3Matrix3x3& m1, const b3Matrix3x3& m2) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) return b3Matrix3x3( - m1[0].mVec128 - m2[0].mVec128, - m1[1].mVec128 - m2[1].mVec128, - m1[2].mVec128 - m2[2].mVec128); + m1[0].mVec128 - m2[0].mVec128, + m1[1].mVec128 - m2[1].mVec128, + m1[2].mVec128 - m2[2].mVec128); #else return b3Matrix3x3( - m1[0][0]-m2[0][0], - m1[0][1]-m2[0][1], - m1[0][2]-m2[0][2], - - m1[1][0]-m2[1][0], - m1[1][1]-m2[1][1], - m1[1][2]-m2[1][2], - - m1[2][0]-m2[2][0], - m1[2][1]-m2[2][1], - m1[2][2]-m2[2][2]); + m1[0][0] - m2[0][0], + m1[0][1] - m2[0][1], + m1[0][2] - m2[0][2], + + m1[1][0] - m2[1][0], + m1[1][1] - m2[1][1], + m1[1][2] - m2[1][2], + + m1[2][0] - m2[2][0], + m1[2][1] - m2[2][1], + m1[2][2] - m2[2][2]); #endif } - -B3_FORCE_INLINE b3Matrix3x3& +B3_FORCE_INLINE b3Matrix3x3& b3Matrix3x3::operator-=(const b3Matrix3x3& m) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) - m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128; - m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128; - m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128; +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128; + m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128; + m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128; #else setValue( - m_el[0][0]-m.m_el[0][0], - m_el[0][1]-m.m_el[0][1], - m_el[0][2]-m.m_el[0][2], - m_el[1][0]-m.m_el[1][0], - m_el[1][1]-m.m_el[1][1], - m_el[1][2]-m.m_el[1][2], - m_el[2][0]-m.m_el[2][0], - m_el[2][1]-m.m_el[2][1], - m_el[2][2]-m.m_el[2][2]); + m_el[0][0] - m.m_el[0][0], + m_el[0][1] - m.m_el[0][1], + m_el[0][2] - m.m_el[0][2], + m_el[1][0] - m.m_el[1][0], + m_el[1][1] - m.m_el[1][1], + m_el[1][2] - m.m_el[1][2], + m_el[2][0] - m.m_el[2][0], + m_el[2][1] - m.m_el[2][1], + m_el[2][2] - m.m_el[2][2]); #endif return *this; } - -B3_FORCE_INLINE b3Scalar +B3_FORCE_INLINE b3Scalar b3Matrix3x3::determinant() const -{ +{ return b3Triple((*this)[0], (*this)[1], (*this)[2]); } - -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 b3Matrix3x3::absolute() const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - return b3Matrix3x3( - _mm_and_ps(m_el[0].mVec128, b3vAbsfMask), - _mm_and_ps(m_el[1].mVec128, b3vAbsfMask), - _mm_and_ps(m_el[2].mVec128, b3vAbsfMask)); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + return b3Matrix3x3( + _mm_and_ps(m_el[0].mVec128, b3vAbsfMask), + _mm_and_ps(m_el[1].mVec128, b3vAbsfMask), + _mm_and_ps(m_el[2].mVec128, b3vAbsfMask)); #elif defined(B3_USE_NEON) - return b3Matrix3x3( - (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask), - (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask), - (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask)); -#else return b3Matrix3x3( - b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()), - b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()), - b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ())); + (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask), + (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask), + (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask)); +#else + return b3Matrix3x3( + b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()), + b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()), + b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ())); #endif } -B3_FORCE_INLINE b3Matrix3x3 -b3Matrix3x3::transpose() const +B3_FORCE_INLINE b3Matrix3x3 +b3Matrix3x3::transpose() const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - __m128 v0 = m_el[0].mVec128; - __m128 v1 = m_el[1].mVec128; - __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 - __m128 vT; - - v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0 - - vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * - v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 - - v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) ); // y0 y1 y2 0 - v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) ); // x0 x1 x2 0 - v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0 - - - return b3Matrix3x3( v0, v1, v2 ); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + __m128 v0 = m_el[0].mVec128; + __m128 v1 = m_el[1].mVec128; + __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 + __m128 vT; + + v2 = _mm_and_ps(v2, b3vFFF0fMask); // x2 y2 z2 0 + + vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * + v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 + + v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0 + v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0 + v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT))); // z0 z1 z2 0 + + return b3Matrix3x3(v0, v1, v2); #elif defined(B3_USE_NEON) - // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. - static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 }; - float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1} - float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0} - float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] ); - float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] ); - float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask ); - float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q ); // z0 z1 z2 0 - return b3Matrix3x3( v0, v1, v2 ); + // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. + static const uint32x2_t zMask = (const uint32x2_t){-1, 0}; + float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1} + float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0} + float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]); + float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]); + float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask); + float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0 + return b3Matrix3x3(v0, v1, v2); #else - return b3Matrix3x3( m_el[0].getX(), m_el[1].getX(), m_el[2].getX(), - m_el[0].getY(), m_el[1].getY(), m_el[2].getY(), - m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ()); + return b3Matrix3x3(m_el[0].getX(), m_el[1].getX(), m_el[2].getX(), + m_el[0].getY(), m_el[1].getY(), m_el[2].getY(), + m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ()); #endif } -B3_FORCE_INLINE b3Matrix3x3 -b3Matrix3x3::adjoint() const +B3_FORCE_INLINE b3Matrix3x3 +b3Matrix3x3::adjoint() const { return b3Matrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2), - cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0), - cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1)); + cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0), + cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1)); } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 b3Matrix3x3::inverse() const { b3Vector3 co = b3MakeVector3(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1)); @@ -1024,54 +1023,54 @@ b3Matrix3x3::inverse() const b3FullAssert(det != b3Scalar(0.0)); b3Scalar s = b3Scalar(1.0) / det; return b3Matrix3x3(co.getX() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s, - co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s, - co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s); + co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s, + co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s); } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - // zeros w -// static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL }; - __m128 row = m_el[0].mVec128; - __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, b3vFFF0fMask ); - __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, b3vFFF0fMask); - __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, b3vFFF0fMask ); - __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0)); - __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55)); - __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa)); - row = m_el[1].mVec128; - r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0))); - r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55))); - r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa))); - row = m_el[2].mVec128; - r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0))); - r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55))); - r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa))); - return b3Matrix3x3( r0, r1, r2 ); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + // zeros w + // static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL }; + __m128 row = m_el[0].mVec128; + __m128 m0 = _mm_and_ps(m.getRow(0).mVec128, b3vFFF0fMask); + __m128 m1 = _mm_and_ps(m.getRow(1).mVec128, b3vFFF0fMask); + __m128 m2 = _mm_and_ps(m.getRow(2).mVec128, b3vFFF0fMask); + __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0)); + __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55)); + __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa)); + row = m_el[1].mVec128; + r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0))); + r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa))); + row = m_el[2].mVec128; + r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0))); + r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa))); + return b3Matrix3x3(r0, r1, r2); #elif defined B3_USE_NEON - // zeros w - static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 }; - float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask ); - float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask ); - float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask ); - float32x4_t row = m_el[0].mVec128; - float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0); - float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1); - float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0); - row = m_el[1].mVec128; - r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0); - r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1); - r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0); - row = m_el[2].mVec128; - r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0); - r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1); - r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0); - return b3Matrix3x3( r0, r1, r2 ); + // zeros w + static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0}; + float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask); + float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask); + float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask); + float32x4_t row = m_el[0].mVec128; + float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0); + float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1); + float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0); + row = m_el[1].mVec128; + r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0); + r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1); + r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0); + row = m_el[2].mVec128; + r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0); + r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1); + r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0); + return b3Matrix3x3(r0, r1, r2); #else - return b3Matrix3x3( + return b3Matrix3x3( m_el[0].getX() * m[0].getX() + m_el[1].getX() * m[1].getX() + m_el[2].getX() * m[2].getX(), m_el[0].getX() * m[0].getY() + m_el[1].getX() * m[1].getY() + m_el[2].getX() * m[2].getY(), m_el[0].getX() * m[0].getZ() + m_el[1].getX() * m[1].getZ() + m_el[2].getX() * m[2].getZ(), @@ -1084,51 +1083,51 @@ b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const #endif } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - __m128 a0 = m_el[0].mVec128; - __m128 a1 = m_el[1].mVec128; - __m128 a2 = m_el[2].mVec128; - - b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here - __m128 mx = mT[0].mVec128; - __m128 my = mT[1].mVec128; - __m128 mz = mT[2].mVec128; - - __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00)); - __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00)); - __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00)); - r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55))); - r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55))); - r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55))); - r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa))); - r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa))); - r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa))); - return b3Matrix3x3( r0, r1, r2); - +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + __m128 a0 = m_el[0].mVec128; + __m128 a1 = m_el[1].mVec128; + __m128 a2 = m_el[2].mVec128; + + b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here + __m128 mx = mT[0].mVec128; + __m128 my = mT[1].mVec128; + __m128 mz = mT[2].mVec128; + + __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00)); + __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00)); + __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00)); + r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55))); + r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55))); + r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa))); + r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa))); + r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa))); + return b3Matrix3x3(r0, r1, r2); + #elif defined B3_USE_NEON - float32x4_t a0 = m_el[0].mVec128; - float32x4_t a1 = m_el[1].mVec128; - float32x4_t a2 = m_el[2].mVec128; - - b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here - float32x4_t mx = mT[0].mVec128; - float32x4_t my = mT[1].mVec128; - float32x4_t mz = mT[2].mVec128; - - float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0); - float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0); - float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0); - r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1); - r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1); - r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1); - r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0); - r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0); - r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0); - return b3Matrix3x3( r0, r1, r2 ); - + float32x4_t a0 = m_el[0].mVec128; + float32x4_t a1 = m_el[1].mVec128; + float32x4_t a2 = m_el[2].mVec128; + + b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here + float32x4_t mx = mT[0].mVec128; + float32x4_t my = mT[1].mVec128; + float32x4_t mz = mT[2].mVec128; + + float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0); + float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0); + float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0); + r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1); + r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1); + r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1); + r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0); + r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0); + r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0); + return b3Matrix3x3(r0, r1, r2); + #else return b3Matrix3x3( m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]), @@ -1137,139 +1136,138 @@ b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const #endif } -B3_FORCE_INLINE b3Vector3 -operator*(const b3Matrix3x3& m, const b3Vector3& v) +B3_FORCE_INLINE b3Vector3 +operator*(const b3Matrix3x3& m, const b3Vector3& v) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON) - return v.dot3(m[0], m[1], m[2]); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + return v.dot3(m[0], m[1], m[2]); #else return b3MakeVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v)); #endif } - B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& v, const b3Matrix3x3& m) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) - const __m128 vv = v.mVec128; + const __m128 vv = v.mVec128; - __m128 c0 = b3_splat_ps( vv, 0); - __m128 c1 = b3_splat_ps( vv, 1); - __m128 c2 = b3_splat_ps( vv, 2); + __m128 c0 = b3_splat_ps(vv, 0); + __m128 c1 = b3_splat_ps(vv, 1); + __m128 c2 = b3_splat_ps(vv, 2); - c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask) ); - c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask) ); - c0 = _mm_add_ps(c0, c1); - c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask) ); - - return b3MakeVector3(_mm_add_ps(c0, c2)); + c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask)); + c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask)); + c0 = _mm_add_ps(c0, c1); + c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask)); + + return b3MakeVector3(_mm_add_ps(c0, c2)); #elif defined(B3_USE_NEON) - const float32x4_t vv = v.mVec128; - const float32x2_t vlo = vget_low_f32(vv); - const float32x2_t vhi = vget_high_f32(vv); - - float32x4_t c0, c1, c2; - - c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); - c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); - c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); - - c0 = vmulq_lane_f32(c0, vlo, 0); - c1 = vmulq_lane_f32(c1, vlo, 1); - c2 = vmulq_lane_f32(c2, vhi, 0); - c0 = vaddq_f32(c0, c1); - c0 = vaddq_f32(c0, c2); - - return b3MakeVector3(c0); + const float32x4_t vv = v.mVec128; + const float32x2_t vlo = vget_low_f32(vv); + const float32x2_t vhi = vget_high_f32(vv); + + float32x4_t c0, c1, c2; + + c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); + c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); + c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); + + c0 = vmulq_lane_f32(c0, vlo, 0); + c1 = vmulq_lane_f32(c1, vlo, 1); + c2 = vmulq_lane_f32(c2, vhi, 0); + c0 = vaddq_f32(c0, c1); + c0 = vaddq_f32(c0, c2); + + return b3MakeVector3(c0); #else return b3MakeVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v)); #endif } -B3_FORCE_INLINE b3Matrix3x3 +B3_FORCE_INLINE b3Matrix3x3 operator*(const b3Matrix3x3& m1, const b3Matrix3x3& m2) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - - __m128 m10 = m1[0].mVec128; - __m128 m11 = m1[1].mVec128; - __m128 m12 = m1[2].mVec128; - - __m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask); - - __m128 c0 = b3_splat_ps( m10, 0); - __m128 c1 = b3_splat_ps( m11, 0); - __m128 c2 = b3_splat_ps( m12, 0); - - c0 = _mm_mul_ps(c0, m2v); - c1 = _mm_mul_ps(c1, m2v); - c2 = _mm_mul_ps(c2, m2v); - - m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask); - - __m128 c0_1 = b3_splat_ps( m10, 1); - __m128 c1_1 = b3_splat_ps( m11, 1); - __m128 c2_1 = b3_splat_ps( m12, 1); - - c0_1 = _mm_mul_ps(c0_1, m2v); - c1_1 = _mm_mul_ps(c1_1, m2v); - c2_1 = _mm_mul_ps(c2_1, m2v); - - m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask); - - c0 = _mm_add_ps(c0, c0_1); - c1 = _mm_add_ps(c1, c1_1); - c2 = _mm_add_ps(c2, c2_1); - - m10 = b3_splat_ps( m10, 2); - m11 = b3_splat_ps( m11, 2); - m12 = b3_splat_ps( m12, 2); - - m10 = _mm_mul_ps(m10, m2v); - m11 = _mm_mul_ps(m11, m2v); - m12 = _mm_mul_ps(m12, m2v); - - c0 = _mm_add_ps(c0, m10); - c1 = _mm_add_ps(c1, m11); - c2 = _mm_add_ps(c2, m12); - - return b3Matrix3x3(c0, c1, c2); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + + __m128 m10 = m1[0].mVec128; + __m128 m11 = m1[1].mVec128; + __m128 m12 = m1[2].mVec128; + + __m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask); + + __m128 c0 = b3_splat_ps(m10, 0); + __m128 c1 = b3_splat_ps(m11, 0); + __m128 c2 = b3_splat_ps(m12, 0); + + c0 = _mm_mul_ps(c0, m2v); + c1 = _mm_mul_ps(c1, m2v); + c2 = _mm_mul_ps(c2, m2v); + + m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask); + + __m128 c0_1 = b3_splat_ps(m10, 1); + __m128 c1_1 = b3_splat_ps(m11, 1); + __m128 c2_1 = b3_splat_ps(m12, 1); + + c0_1 = _mm_mul_ps(c0_1, m2v); + c1_1 = _mm_mul_ps(c1_1, m2v); + c2_1 = _mm_mul_ps(c2_1, m2v); + + m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask); + + c0 = _mm_add_ps(c0, c0_1); + c1 = _mm_add_ps(c1, c1_1); + c2 = _mm_add_ps(c2, c2_1); + + m10 = b3_splat_ps(m10, 2); + m11 = b3_splat_ps(m11, 2); + m12 = b3_splat_ps(m12, 2); + + m10 = _mm_mul_ps(m10, m2v); + m11 = _mm_mul_ps(m11, m2v); + m12 = _mm_mul_ps(m12, m2v); + + c0 = _mm_add_ps(c0, m10); + c1 = _mm_add_ps(c1, m11); + c2 = _mm_add_ps(c2, m12); + + return b3Matrix3x3(c0, c1, c2); #elif defined(B3_USE_NEON) - float32x4_t rv0, rv1, rv2; - float32x4_t v0, v1, v2; - float32x4_t mv0, mv1, mv2; - - v0 = m1[0].mVec128; - v1 = m1[1].mVec128; - v2 = m1[2].mVec128; - - mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask); - mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask); - mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask); - - rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); - rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); - rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); - - rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); - rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); - rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); - - rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); - rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); - rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); + float32x4_t rv0, rv1, rv2; + float32x4_t v0, v1, v2; + float32x4_t mv0, mv1, mv2; + + v0 = m1[0].mVec128; + v1 = m1[1].mVec128; + v2 = m1[2].mVec128; + + mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask); + mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask); + mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask); + + rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); + rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); + rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); + + rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); + rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); + rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); + + rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); + rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); + rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); return b3Matrix3x3(rv0, rv1, rv2); - -#else + +#else return b3Matrix3x3( - m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]), - m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]), - m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2])); + m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]), + m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]), + m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2])); #endif } @@ -1292,71 +1290,65 @@ m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]); * It will test all elements are equal. */ B3_FORCE_INLINE bool operator==(const b3Matrix3x3& m1, const b3Matrix3x3& m2) { -#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) - - __m128 c0, c1, c2; - - c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128); - c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128); - c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128); - - c0 = _mm_and_ps(c0, c1); - c0 = _mm_and_ps(c0, c2); - - return (0x7 == _mm_movemask_ps((__m128)c0)); -#else - return - ( m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] && - m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] && - m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] ); +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) + + __m128 c0, c1, c2; + + c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128); + c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128); + c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128); + + c0 = _mm_and_ps(c0, c1); + c0 = _mm_and_ps(c0, c2); + + return (0x7 == _mm_movemask_ps((__m128)c0)); +#else + return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] && + m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] && + m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]); #endif } ///for serialization -struct b3Matrix3x3FloatData +struct b3Matrix3x3FloatData { b3Vector3FloatData m_el[3]; }; ///for serialization -struct b3Matrix3x3DoubleData +struct b3Matrix3x3DoubleData { b3Vector3DoubleData m_el[3]; }; - - - -B3_FORCE_INLINE void b3Matrix3x3::serialize(struct b3Matrix3x3Data& dataOut) const +B3_FORCE_INLINE void b3Matrix3x3::serialize(struct b3Matrix3x3Data& dataOut) const { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].serialize(dataOut.m_el[i]); } -B3_FORCE_INLINE void b3Matrix3x3::serializeFloat(struct b3Matrix3x3FloatData& dataOut) const +B3_FORCE_INLINE void b3Matrix3x3::serializeFloat(struct b3Matrix3x3FloatData& dataOut) const { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].serializeFloat(dataOut.m_el[i]); } - -B3_FORCE_INLINE void b3Matrix3x3::deSerialize(const struct b3Matrix3x3Data& dataIn) +B3_FORCE_INLINE void b3Matrix3x3::deSerialize(const struct b3Matrix3x3Data& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerialize(dataIn.m_el[i]); } -B3_FORCE_INLINE void b3Matrix3x3::deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn) +B3_FORCE_INLINE void b3Matrix3x3::deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerializeFloat(dataIn.m_el[i]); } -B3_FORCE_INLINE void b3Matrix3x3::deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn) +B3_FORCE_INLINE void b3Matrix3x3::deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerializeDouble(dataIn.m_el[i]); } -#endif //B3_MATRIX3x3_H - +#endif //B3_MATRIX3x3_H diff --git a/thirdparty/bullet/Bullet3Common/b3MinMax.h b/thirdparty/bullet/Bullet3Common/b3MinMax.h index 73af23a4f9..c09c3db3f5 100644 --- a/thirdparty/bullet/Bullet3Common/b3MinMax.h +++ b/thirdparty/bullet/Bullet3Common/b3MinMax.h @@ -12,60 +12,58 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_GEN_MINMAX_H #define B3_GEN_MINMAX_H #include "b3Scalar.h" template <class T> -B3_FORCE_INLINE const T& b3Min(const T& a, const T& b) +B3_FORCE_INLINE const T& b3Min(const T& a, const T& b) { - return a < b ? a : b ; + return a < b ? a : b; } template <class T> -B3_FORCE_INLINE const T& b3Max(const T& a, const T& b) +B3_FORCE_INLINE const T& b3Max(const T& a, const T& b) { - return a > b ? a : b; + return a > b ? a : b; } template <class T> -B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub) +B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub) { - return a < lb ? lb : (ub < a ? ub : a); + return a < lb ? lb : (ub < a ? ub : a); } template <class T> -B3_FORCE_INLINE void b3SetMin(T& a, const T& b) +B3_FORCE_INLINE void b3SetMin(T& a, const T& b) { - if (b < a) + if (b < a) { a = b; } } template <class T> -B3_FORCE_INLINE void b3SetMax(T& a, const T& b) +B3_FORCE_INLINE void b3SetMax(T& a, const T& b) { - if (a < b) + if (a < b) { a = b; } } template <class T> -B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub) +B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub) { - if (a < lb) + if (a < lb) { - a = lb; + a = lb; } - else if (ub < a) + else if (ub < a) { a = ub; } } -#endif //B3_GEN_MINMAX_H +#endif //B3_GEN_MINMAX_H diff --git a/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h b/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h index 2fcdcf5b24..ed56bc627d 100644 --- a/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h +++ b/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h @@ -12,7 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef _BT_POOL_ALLOCATOR_H #define _BT_POOL_ALLOCATOR_H @@ -22,37 +21,37 @@ subject to the following restrictions: ///The b3PoolAllocator class allows to efficiently allocate a large pool of objects, instead of dynamically allocating them separately. class b3PoolAllocator { - int m_elemSize; - int m_maxElements; - int m_freeCount; - void* m_firstFree; - unsigned char* m_pool; + int m_elemSize; + int m_maxElements; + int m_freeCount; + void* m_firstFree; + unsigned char* m_pool; public: - b3PoolAllocator(int elemSize, int maxElements) - :m_elemSize(elemSize), - m_maxElements(maxElements) + : m_elemSize(elemSize), + m_maxElements(maxElements) { - m_pool = (unsigned char*) b3AlignedAlloc( static_cast<unsigned int>(m_elemSize*m_maxElements),16); + m_pool = (unsigned char*)b3AlignedAlloc(static_cast<unsigned int>(m_elemSize * m_maxElements), 16); unsigned char* p = m_pool; - m_firstFree = p; - m_freeCount = m_maxElements; - int count = m_maxElements; - while (--count) { - *(void**)p = (p + m_elemSize); - p += m_elemSize; - } - *(void**)p = 0; - } + m_firstFree = p; + m_freeCount = m_maxElements; + int count = m_maxElements; + while (--count) + { + *(void**)p = (p + m_elemSize); + p += m_elemSize; + } + *(void**)p = 0; + } ~b3PoolAllocator() { - b3AlignedFree( m_pool); + b3AlignedFree(m_pool); } - int getFreeCount() const + int getFreeCount() const { return m_freeCount; } @@ -67,21 +66,22 @@ public: return m_maxElements; } - void* allocate(int size) + void* allocate(int size) { // release mode fix (void)size; - b3Assert(!size || size<=m_elemSize); - b3Assert(m_freeCount>0); - void* result = m_firstFree; - m_firstFree = *(void**)m_firstFree; - --m_freeCount; - return result; + b3Assert(!size || size <= m_elemSize); + b3Assert(m_freeCount > 0); + void* result = m_firstFree; + m_firstFree = *(void**)m_firstFree; + --m_freeCount; + return result; } bool validPtr(void* ptr) { - if (ptr) { + if (ptr) + { if (((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize)) { return true; @@ -90,32 +90,32 @@ public: return false; } - void freeMemory(void* ptr) + void freeMemory(void* ptr) { - if (ptr) { - b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize); + if (ptr) + { + b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize); - *(void**)ptr = m_firstFree; - m_firstFree = ptr; - ++m_freeCount; - } + *(void**)ptr = m_firstFree; + m_firstFree = ptr; + ++m_freeCount; + } } - int getElementSize() const + int getElementSize() const { return m_elemSize; } - unsigned char* getPoolAddress() + unsigned char* getPoolAddress() { return m_pool; } - const unsigned char* getPoolAddress() const + const unsigned char* getPoolAddress() const { return m_pool; } - }; -#endif //_BT_POOL_ALLOCATOR_H +#endif //_BT_POOL_ALLOCATOR_H diff --git a/thirdparty/bullet/Bullet3Common/b3QuadWord.h b/thirdparty/bullet/Bullet3Common/b3QuadWord.h index 65c9581977..0def305fac 100644 --- a/thirdparty/bullet/Bullet3Common/b3QuadWord.h +++ b/thirdparty/bullet/Bullet3Common/b3QuadWord.h @@ -12,18 +12,13 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_SIMD_QUADWORD_H #define B3_SIMD_QUADWORD_H #include "b3Scalar.h" #include "b3MinMax.h" - - - - -#if defined (__CELLOS_LV2) && defined (__SPU__) +#if defined(__CELLOS_LV2) && defined(__SPU__) #include <altivec.h> #endif @@ -31,58 +26,64 @@ subject to the following restrictions: * Some issues under PS3 Linux with IBM 2.1 SDK, gcc compiler prevent from using aligned quadword. */ #ifndef USE_LIBSPE2 -B3_ATTRIBUTE_ALIGNED16(class) b3QuadWord +B3_ATTRIBUTE_ALIGNED16(class) +b3QuadWord #else class b3QuadWord #endif { protected: - -#if defined (__SPU__) && defined (__CELLOS_LV2__) +#if defined(__SPU__) && defined(__CELLOS_LV2__) union { vec_float4 mVec128; - b3Scalar m_floats[4]; + b3Scalar m_floats[4]; }; + public: - vec_float4 get128() const + vec_float4 get128() const { return mVec128; } -#else //__CELLOS_LV2__ __SPU__ +#else //__CELLOS_LV2__ __SPU__ -#if defined(B3_USE_SSE) || defined(B3_USE_NEON) +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) public: union { b3SimdFloat4 mVec128; - b3Scalar m_floats[4]; - struct {b3Scalar x,y,z,w;}; + b3Scalar m_floats[4]; + struct + { + b3Scalar x, y, z, w; + }; }; + public: - B3_FORCE_INLINE b3SimdFloat4 get128() const + B3_FORCE_INLINE b3SimdFloat4 get128() const { return mVec128; } - B3_FORCE_INLINE void set128(b3SimdFloat4 v128) + B3_FORCE_INLINE void set128(b3SimdFloat4 v128) { mVec128 = v128; } #else public: - union - { - b3Scalar m_floats[4]; - struct {b3Scalar x,y,z,w;}; + union { + b3Scalar m_floats[4]; + struct + { + b3Scalar x, y, z, w; + }; }; -#endif // B3_USE_SSE +#endif // B3_USE_SSE -#endif //__CELLOS_LV2__ __SPU__ +#endif //__CELLOS_LV2__ __SPU__ - public: - +public: #if defined(B3_USE_SSE) || defined(B3_USE_NEON) - // Set Vector + // Set Vector B3_FORCE_INLINE b3QuadWord(const b3SimdFloat4 vec) { mVec128 = vec; @@ -95,151 +96,147 @@ public: } // Assignment Operator - B3_FORCE_INLINE b3QuadWord& - operator=(const b3QuadWord& v) + B3_FORCE_INLINE b3QuadWord& + operator=(const b3QuadWord& v) { mVec128 = v.mVec128; - + return *this; } - + #endif - /**@brief Return the x value */ - B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; } - /**@brief Return the y value */ - B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; } - /**@brief Return the z value */ - B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; } - /**@brief Set the x value */ - B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x;}; - /**@brief Set the y value */ - B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y;}; - /**@brief Set the z value */ - B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z;}; - /**@brief Set the w value */ - B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w;}; - /**@brief Return the x value */ - - - //B3_FORCE_INLINE b3Scalar& operator[](int i) { return (&m_floats[0])[i]; } + /**@brief Return the x value */ + B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; } + /**@brief Return the y value */ + B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; } + /**@brief Return the z value */ + B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; } + /**@brief Set the x value */ + B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; }; + /**@brief Set the y value */ + B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; }; + /**@brief Set the z value */ + B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; }; + /**@brief Set the w value */ + B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; }; + /**@brief Return the x value */ + + //B3_FORCE_INLINE b3Scalar& operator[](int i) { return (&m_floats[0])[i]; } //B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; } ///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons. - B3_FORCE_INLINE operator b3Scalar *() { return &m_floats[0]; } - B3_FORCE_INLINE operator const b3Scalar *() const { return &m_floats[0]; } + B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; } + B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; } - B3_FORCE_INLINE bool operator==(const b3QuadWord& other) const + B3_FORCE_INLINE bool operator==(const b3QuadWord& other) const { #ifdef B3_USE_SSE - return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); -#else - return ((m_floats[3]==other.m_floats[3]) && - (m_floats[2]==other.m_floats[2]) && - (m_floats[1]==other.m_floats[1]) && - (m_floats[0]==other.m_floats[0])); + return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); +#else + return ((m_floats[3] == other.m_floats[3]) && + (m_floats[2] == other.m_floats[2]) && + (m_floats[1] == other.m_floats[1]) && + (m_floats[0] == other.m_floats[0])); #endif } - B3_FORCE_INLINE bool operator!=(const b3QuadWord& other) const + B3_FORCE_INLINE bool operator!=(const b3QuadWord& other) const { return !(*this == other); } - /**@brief Set x,y,z and zero w + /**@brief Set x,y,z and zero w * @param x Value of x * @param y Value of y * @param z Value of z */ - B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3] = 0.f; - } + B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = 0.f; + } -/* void getValue(b3Scalar *m) const + /* void getValue(b3Scalar *m) const { m[0] = m_floats[0]; m[1] = m_floats[1]; m[2] = m_floats[2]; } */ -/**@brief Set the values + /**@brief Set the values * @param x Value of x * @param y Value of y * @param z Value of z * @param w Value of w */ - B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3]=_w; - } - /**@brief No initialization constructor */ - B3_FORCE_INLINE b3QuadWord() - // :m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.)) - { - } - - /**@brief Three argument constructor (zeros w) + B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = _w; + } + /**@brief No initialization constructor */ + B3_FORCE_INLINE b3QuadWord() + // :m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.)) + { + } + + /**@brief Three argument constructor (zeros w) * @param x Value of x * @param y Value of y * @param z Value of z */ - B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) - { - m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f; - } + B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) + { + m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f; + } -/**@brief Initializing constructor + /**@brief Initializing constructor * @param x Value of x * @param y Value of y * @param z Value of z * @param w Value of w */ - B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) - { - m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w; - } + B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) + { + m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w; + } - /**@brief Set each element to the max of the current values and the values of another b3QuadWord + /**@brief Set each element to the max of the current values and the values of another b3QuadWord * @param other The other b3QuadWord to compare with */ - B3_FORCE_INLINE void setMax(const b3QuadWord& other) - { - #ifdef B3_USE_SSE - mVec128 = _mm_max_ps(mVec128, other.mVec128); - #elif defined(B3_USE_NEON) - mVec128 = vmaxq_f32(mVec128, other.mVec128); - #else - b3SetMax(m_floats[0], other.m_floats[0]); - b3SetMax(m_floats[1], other.m_floats[1]); - b3SetMax(m_floats[2], other.m_floats[2]); - b3SetMax(m_floats[3], other.m_floats[3]); - #endif - } - /**@brief Set each element to the min of the current values and the values of another b3QuadWord + B3_FORCE_INLINE void setMax(const b3QuadWord& other) + { +#ifdef B3_USE_SSE + mVec128 = _mm_max_ps(mVec128, other.mVec128); +#elif defined(B3_USE_NEON) + mVec128 = vmaxq_f32(mVec128, other.mVec128); +#else + b3SetMax(m_floats[0], other.m_floats[0]); + b3SetMax(m_floats[1], other.m_floats[1]); + b3SetMax(m_floats[2], other.m_floats[2]); + b3SetMax(m_floats[3], other.m_floats[3]); +#endif + } + /**@brief Set each element to the min of the current values and the values of another b3QuadWord * @param other The other b3QuadWord to compare with */ - B3_FORCE_INLINE void setMin(const b3QuadWord& other) - { - #ifdef B3_USE_SSE - mVec128 = _mm_min_ps(mVec128, other.mVec128); - #elif defined(B3_USE_NEON) - mVec128 = vminq_f32(mVec128, other.mVec128); - #else - b3SetMin(m_floats[0], other.m_floats[0]); - b3SetMin(m_floats[1], other.m_floats[1]); - b3SetMin(m_floats[2], other.m_floats[2]); - b3SetMin(m_floats[3], other.m_floats[3]); - #endif - } - - - + B3_FORCE_INLINE void setMin(const b3QuadWord& other) + { +#ifdef B3_USE_SSE + mVec128 = _mm_min_ps(mVec128, other.mVec128); +#elif defined(B3_USE_NEON) + mVec128 = vminq_f32(mVec128, other.mVec128); +#else + b3SetMin(m_floats[0], other.m_floats[0]); + b3SetMin(m_floats[1], other.m_floats[1]); + b3SetMin(m_floats[2], other.m_floats[2]); + b3SetMin(m_floats[3], other.m_floats[3]); +#endif + } }; -#endif //B3_SIMD_QUADWORD_H +#endif //B3_SIMD_QUADWORD_H diff --git a/thirdparty/bullet/Bullet3Common/b3Quaternion.h b/thirdparty/bullet/Bullet3Common/b3Quaternion.h index ad20543348..9bd5ff7d90 100644 --- a/thirdparty/bullet/Bullet3Common/b3Quaternion.h +++ b/thirdparty/bullet/Bullet3Common/b3Quaternion.h @@ -12,19 +12,12 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_SIMD__QUATERNION_H_ #define B3_SIMD__QUATERNION_H_ - #include "b3Vector3.h" #include "b3QuadWord.h" - - - - #ifdef B3_USE_SSE const __m128 B3_ATTRIBUTE_ALIGNED16(b3vOnes) = {1.0f, 1.0f, 1.0f, 1.0f}; @@ -39,13 +32,14 @@ const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f #endif /**@brief The b3Quaternion implements quaternion to perform linear algebra rotations in combination with b3Matrix3x3, b3Vector3 and b3Transform. */ -class b3Quaternion : public b3QuadWord { +class b3Quaternion : public b3QuadWord +{ public: - /**@brief No initialization constructor */ + /**@brief No initialization constructor */ b3Quaternion() {} -#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))|| defined(B3_USE_NEON) - // Set Vector +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON) + // Set Vector B3_FORCE_INLINE b3Quaternion(const b3SimdFloat4 vec) { mVec128 = vec; @@ -58,63 +52,70 @@ public: } // Assignment Operator - B3_FORCE_INLINE b3Quaternion& - operator=(const b3Quaternion& v) + B3_FORCE_INLINE b3Quaternion& + operator=(const b3Quaternion& v) { mVec128 = v.mVec128; - + return *this; } - + #endif // template <typename b3Scalar> // explicit Quaternion(const b3Scalar *v) : Tuple4<b3Scalar>(v) {} - /**@brief Constructor from scalars */ - b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) - : b3QuadWord(_x, _y, _z, _w) + /**@brief Constructor from scalars */ + b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) + : b3QuadWord(_x, _y, _z, _w) { //b3Assert(!((_x==1.f) && (_y==0.f) && (_z==0.f) && (_w==0.f))); } - /**@brief Axis angle Constructor + /**@brief Axis angle Constructor * @param axis The axis which the rotation is around * @param angle The magnitude of the rotation around the angle (Radians) */ - b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle) - { - setRotation(_axis, _angle); + b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle) + { + setRotation(_axis, _angle); } - /**@brief Constructor from Euler angles + /**@brief Constructor from Euler angles * @param yaw Angle around Y unless B3_EULER_DEFAULT_ZYX defined then Z * @param pitch Angle around X unless B3_EULER_DEFAULT_ZYX defined then Y * @param roll Angle around Z unless B3_EULER_DEFAULT_ZYX defined then X */ b3Quaternion(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) - { + { #ifndef B3_EULER_DEFAULT_ZYX - setEuler(yaw, pitch, roll); + setEuler(yaw, pitch, roll); #else - setEulerZYX(yaw, pitch, roll); -#endif + setEulerZYX(yaw, pitch, roll); +#endif } - /**@brief Set the rotation using axis angle notation + /**@brief Set the rotation using axis angle notation * @param axis The axis around which to rotate * @param angle The magnitude of the rotation in Radians */ void setRotation(const b3Vector3& axis, const b3Scalar& _angle) { b3Scalar d = axis.length(); b3Assert(d != b3Scalar(0.0)); - b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d; - setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s, - b3Cos(_angle * b3Scalar(0.5))); + if (d < B3_EPSILON) + { + setValue(0, 0, 0, 1); + } + else + { + b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d; + setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s, + b3Cos(_angle * b3Scalar(0.5))); + } } - /**@brief Set the quaternion using Euler angles + /**@brief Set the quaternion using Euler angles * @param yaw Angle around Y * @param pitch Angle around X * @param roll Angle around Z */ void setEuler(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) { - b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5); - b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5); - b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5); + b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5); + b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5); + b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5); b3Scalar cosYaw = b3Cos(halfYaw); b3Scalar sinYaw = b3Sin(halfYaw); b3Scalar cosPitch = b3Cos(halfPitch); @@ -122,34 +123,34 @@ public: b3Scalar cosRoll = b3Cos(halfRoll); b3Scalar sinRoll = b3Sin(halfRoll); setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, - cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, - sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, - cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); + cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, + sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, + cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); } - + /**@brief Set the quaternion using euler angles * @param yaw Angle around Z * @param pitch Angle around Y * @param roll Angle around X */ void setEulerZYX(const b3Scalar& yawZ, const b3Scalar& pitchY, const b3Scalar& rollX) { - b3Scalar halfYaw = b3Scalar(yawZ) * b3Scalar(0.5); - b3Scalar halfPitch = b3Scalar(pitchY) * b3Scalar(0.5); - b3Scalar halfRoll = b3Scalar(rollX) * b3Scalar(0.5); + b3Scalar halfYaw = b3Scalar(yawZ) * b3Scalar(0.5); + b3Scalar halfPitch = b3Scalar(pitchY) * b3Scalar(0.5); + b3Scalar halfRoll = b3Scalar(rollX) * b3Scalar(0.5); b3Scalar cosYaw = b3Cos(halfYaw); b3Scalar sinYaw = b3Sin(halfYaw); b3Scalar cosPitch = b3Cos(halfPitch); b3Scalar sinPitch = b3Sin(halfPitch); b3Scalar cosRoll = b3Cos(halfRoll); b3Scalar sinRoll = b3Sin(halfRoll); - setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x - cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y - cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z - cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx + setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x + cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y + cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z + cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx normalize(); } - /**@brief Get the euler angles from this quaternion + /**@brief Get the euler angles from this quaternion * @param yaw Angle around Z * @param pitch Angle around Y * @param roll Angle around X */ @@ -166,221 +167,221 @@ public: squ = m_floats[3] * m_floats[3]; rollX = b3Atan2(2 * (m_floats[1] * m_floats[2] + m_floats[3] * m_floats[0]), squ - sqx - sqy + sqz); sarg = b3Scalar(-2.) * (m_floats[0] * m_floats[2] - m_floats[3] * m_floats[1]); - pitchY = sarg <= b3Scalar(-1.0) ? b3Scalar(-0.5) * B3_PI: (sarg >= b3Scalar(1.0) ? b3Scalar(0.5) * B3_PI : b3Asin(sarg)); + pitchY = sarg <= b3Scalar(-1.0) ? b3Scalar(-0.5) * B3_PI : (sarg >= b3Scalar(1.0) ? b3Scalar(0.5) * B3_PI : b3Asin(sarg)); yawZ = b3Atan2(2 * (m_floats[0] * m_floats[1] + m_floats[3] * m_floats[2]), squ + sqx - sqy - sqz); } - /**@brief Add two quaternions + /**@brief Add two quaternions * @param q The quaternion to add to this one */ - B3_FORCE_INLINE b3Quaternion& operator+=(const b3Quaternion& q) + B3_FORCE_INLINE b3Quaternion& operator+=(const b3Quaternion& q) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_add_ps(mVec128, q.mVec128); #elif defined(B3_USE_NEON) mVec128 = vaddq_f32(mVec128, q.mVec128); -#else - m_floats[0] += q.getX(); - m_floats[1] += q.getY(); - m_floats[2] += q.getZ(); - m_floats[3] += q.m_floats[3]; +#else + m_floats[0] += q.getX(); + m_floats[1] += q.getY(); + m_floats[2] += q.getZ(); + m_floats[3] += q.m_floats[3]; #endif return *this; } - /**@brief Subtract out a quaternion + /**@brief Subtract out a quaternion * @param q The quaternion to subtract from this one */ - b3Quaternion& operator-=(const b3Quaternion& q) + b3Quaternion& operator-=(const b3Quaternion& q) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_sub_ps(mVec128, q.mVec128); #elif defined(B3_USE_NEON) mVec128 = vsubq_f32(mVec128, q.mVec128); -#else - m_floats[0] -= q.getX(); - m_floats[1] -= q.getY(); - m_floats[2] -= q.getZ(); - m_floats[3] -= q.m_floats[3]; +#else + m_floats[0] -= q.getX(); + m_floats[1] -= q.getY(); + m_floats[2] -= q.getZ(); + m_floats[3] -= q.m_floats[3]; #endif - return *this; + return *this; } - /**@brief Scale this quaternion + /**@brief Scale this quaternion * @param s The scalar to scale by */ b3Quaternion& operator*=(const b3Scalar& s) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = b3_pshufd_ps(vs, 0); // (S S S S) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = b3_pshufd_ps(vs, 0); // (S S S S) mVec128 = _mm_mul_ps(mVec128, vs); #elif defined(B3_USE_NEON) mVec128 = vmulq_n_f32(mVec128, s); #else - m_floats[0] *= s; - m_floats[1] *= s; - m_floats[2] *= s; - m_floats[3] *= s; + m_floats[0] *= s; + m_floats[1] *= s; + m_floats[2] *= s; + m_floats[3] *= s; #endif return *this; } - /**@brief Multiply this quaternion by q on the right + /**@brief Multiply this quaternion by q on the right * @param q The other quaternion * Equivilant to this = this * q */ b3Quaternion& operator*=(const b3Quaternion& q) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) __m128 vQ2 = q.get128(); - - __m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0,1,2,0)); - __m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0)); - + + __m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0, 1, 2, 0)); + __m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0)); + A1 = A1 * B1; - - __m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1,2,0,1)); - __m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); - + + __m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 1)); + __m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1)); + A2 = A2 * B2; - - B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2,0,1,2)); - B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); - - B1 = B1 * B2; // A3 *= B3 - - mVec128 = b3_splat_ps(mVec128, 3); // A0 - mVec128 = mVec128 * vQ2; // A0 * B0 - - A1 = A1 + A2; // AB12 - mVec128 = mVec128 - B1; // AB03 = AB0 - AB3 - A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element - mVec128 = mVec128+ A1; // AB03 + AB12 - -#elif defined(B3_USE_NEON) - - float32x4_t vQ1 = mVec128; - float32x4_t vQ2 = q.get128(); - float32x4_t A0, A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; - - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - - A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x - B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X - - A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z - - A1 = vmulq_f32(A1, B1); - A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 - - // change the sign of the last element - A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); - A0 = vaddq_f32(A0, A1); // AB03 + AB12 - - mVec128 = A0; + + B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2, 0, 1, 2)); + B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2)); + + B1 = B1 * B2; // A3 *= B3 + + mVec128 = b3_splat_ps(mVec128, 3); // A0 + mVec128 = mVec128 * vQ2; // A0 * B0 + + A1 = A1 + A2; // AB12 + mVec128 = mVec128 - B1; // AB03 = AB0 - AB3 + A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element + mVec128 = mVec128 + A1; // AB03 + AB12 + +#elif defined(B3_USE_NEON) + + float32x4_t vQ1 = mVec128; + float32x4_t vQ2 = q.get128(); + float32x4_t A0, A1, B1, A2, B2, A3, B3; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; + + { + float32x2x2_t tmp; + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; + + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + + A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x + B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X + + A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + + A1 = vmulq_f32(A1, B1); + A2 = vmulq_f32(A2, B2); + A3 = vmulq_f32(A3, B3); // A3 *= B3 + A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 + + // change the sign of the last element + A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); + A0 = vaddq_f32(A0, A1); // AB03 + AB12 + + mVec128 = A0; #else setValue( - m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(), + m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(), m_floats[3] * q.getY() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.getX() - m_floats[0] * q.getZ(), m_floats[3] * q.getZ() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.getY() - m_floats[1] * q.getX(), m_floats[3] * q.m_floats[3] - m_floats[0] * q.getX() - m_floats[1] * q.getY() - m_floats[2] * q.getZ()); #endif return *this; } - /**@brief Return the dot product between this quaternion and another + /**@brief Return the dot product between this quaternion and another * @param q The other quaternion */ b3Scalar dot(const b3Quaternion& q) const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vd; - +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vd; + vd = _mm_mul_ps(mVec128, q.mVec128); - - __m128 t = _mm_movehl_ps(vd, vd); + + __m128 t = _mm_movehl_ps(vd, vd); vd = _mm_add_ps(vd, t); t = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, t); - - return _mm_cvtss_f32(vd); + + return _mm_cvtss_f32(vd); #elif defined(B3_USE_NEON) float32x4_t vd = vmulq_f32(mVec128, q.mVec128); - float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd)); + float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd)); x = vpadd_f32(x, x); return vget_lane_f32(x, 0); -#else - return m_floats[0] * q.getX() + - m_floats[1] * q.getY() + - m_floats[2] * q.getZ() + - m_floats[3] * q.m_floats[3]; +#else + return m_floats[0] * q.getX() + + m_floats[1] * q.getY() + + m_floats[2] * q.getZ() + + m_floats[3] * q.m_floats[3]; #endif } - /**@brief Return the length squared of the quaternion */ + /**@brief Return the length squared of the quaternion */ b3Scalar length2() const { return dot(*this); } - /**@brief Return the length of the quaternion */ + /**@brief Return the length of the quaternion */ b3Scalar length() const { return b3Sqrt(length2()); } - /**@brief Normalize the quaternion + /**@brief Normalize the quaternion * Such that x^2 + y^2 + z^2 +w^2 = 1 */ - b3Quaternion& normalize() + b3Quaternion& normalize() { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vd; - +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vd; + vd = _mm_mul_ps(mVec128, mVec128); - - __m128 t = _mm_movehl_ps(vd, vd); + + __m128 t = _mm_movehl_ps(vd, vd); vd = _mm_add_ps(vd, t); t = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, t); vd = _mm_sqrt_ss(vd); vd = _mm_div_ss(b3vOnes, vd); - vd = b3_pshufd_ps(vd, 0); // splat + vd = b3_pshufd_ps(vd, 0); // splat mVec128 = _mm_mul_ps(mVec128, vd); - + return *this; -#else +#else return *this /= length(); #endif } - /**@brief Return a scaled version of this quaternion + /**@brief Return a scaled version of this quaternion * @param s The scale factor */ B3_FORCE_INLINE b3Quaternion operator*(const b3Scalar& s) const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = b3_pshufd_ps(vs, 0x00); // (S S S S) - +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = b3_pshufd_ps(vs, 0x00); // (S S S S) + return b3Quaternion(_mm_mul_ps(mVec128, vs)); #elif defined(B3_USE_NEON) return b3Quaternion(vmulq_n_f32(mVec128, s)); @@ -389,7 +390,7 @@ public: #endif } - /**@brief Return an inversely scaled versionof this quaternion + /**@brief Return an inversely scaled versionof this quaternion * @param s The inverse scale factor */ b3Quaternion operator/(const b3Scalar& s) const { @@ -397,29 +398,29 @@ public: return *this * (b3Scalar(1.0) / s); } - /**@brief Inversely scale this quaternion + /**@brief Inversely scale this quaternion * @param s The scale factor */ - b3Quaternion& operator/=(const b3Scalar& s) + b3Quaternion& operator/=(const b3Scalar& s) { b3Assert(s != b3Scalar(0.0)); return *this *= b3Scalar(1.0) / s; } - /**@brief Return a normalized version of this quaternion */ - b3Quaternion normalized() const + /**@brief Return a normalized version of this quaternion */ + b3Quaternion normalized() const { return *this / length(); - } - /**@brief Return the angle between this quaternion and the other + } + /**@brief Return the angle between this quaternion and the other * @param q The other quaternion */ - b3Scalar angle(const b3Quaternion& q) const + b3Scalar angle(const b3Quaternion& q) const { b3Scalar s = b3Sqrt(length2() * q.length2()); b3Assert(s != b3Scalar(0.0)); return b3Acos(dot(q) / s); } - /**@brief Return the angle of rotation represented by this quaternion */ - b3Scalar getAngle() const + /**@brief Return the angle of rotation represented by this quaternion */ + b3Scalar getAngle() const { b3Scalar s = b3Scalar(2.) * b3Acos(m_floats[3]); return s; @@ -428,117 +429,116 @@ public: /**@brief Return the axis of the rotation represented by this quaternion */ b3Vector3 getAxis() const { - b3Scalar s_squared = 1.f-m_floats[3]*m_floats[3]; - - if (s_squared < b3Scalar(10.) * B3_EPSILON) //Check for divide by zero - return b3MakeVector3(1.0, 0.0, 0.0); // Arbitrary - b3Scalar s = 1.f/b3Sqrt(s_squared); + b3Scalar s_squared = 1.f - m_floats[3] * m_floats[3]; + + if (s_squared < b3Scalar(10.) * B3_EPSILON) //Check for divide by zero + return b3MakeVector3(1.0, 0.0, 0.0); // Arbitrary + b3Scalar s = 1.f / b3Sqrt(s_squared); return b3MakeVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s); } /**@brief Return the inverse of this quaternion */ b3Quaternion inverse() const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3Quaternion(_mm_xor_ps(mVec128, b3vQInv)); #elif defined(B3_USE_NEON) - return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv)); -#else + return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv)); +#else return b3Quaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]); #endif } - /**@brief Return the sum of this quaternion and the other + /**@brief Return the sum of this quaternion and the other * @param q2 The other quaternion */ B3_FORCE_INLINE b3Quaternion operator+(const b3Quaternion& q2) const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3Quaternion(_mm_add_ps(mVec128, q2.mVec128)); #elif defined(B3_USE_NEON) - return b3Quaternion(vaddq_f32(mVec128, q2.mVec128)); -#else + return b3Quaternion(vaddq_f32(mVec128, q2.mVec128)); +#else const b3Quaternion& q1 = *this; return b3Quaternion(q1.getX() + q2.getX(), q1.getY() + q2.getY(), q1.getZ() + q2.getZ(), q1.m_floats[3] + q2.m_floats[3]); #endif } - /**@brief Return the difference between this quaternion and the other + /**@brief Return the difference between this quaternion and the other * @param q2 The other quaternion */ B3_FORCE_INLINE b3Quaternion operator-(const b3Quaternion& q2) const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3Quaternion(_mm_sub_ps(mVec128, q2.mVec128)); #elif defined(B3_USE_NEON) - return b3Quaternion(vsubq_f32(mVec128, q2.mVec128)); -#else + return b3Quaternion(vsubq_f32(mVec128, q2.mVec128)); +#else const b3Quaternion& q1 = *this; return b3Quaternion(q1.getX() - q2.getX(), q1.getY() - q2.getY(), q1.getZ() - q2.getZ(), q1.m_floats[3] - q2.m_floats[3]); #endif } - /**@brief Return the negative of this quaternion + /**@brief Return the negative of this quaternion * This simply negates each element */ B3_FORCE_INLINE b3Quaternion operator-() const { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3Quaternion(_mm_xor_ps(mVec128, b3vMzeroMask)); #elif defined(B3_USE_NEON) - return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask) ); -#else + return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask)); +#else const b3Quaternion& q2 = *this; - return b3Quaternion( - q2.getX(), - q2.getY(), - q2.getZ(), - q2.m_floats[3]); + return b3Quaternion(-q2.getX(), -q2.getY(), -q2.getZ(), -q2.m_floats[3]); #endif } - /**@todo document this and it's use */ - B3_FORCE_INLINE b3Quaternion farthest( const b3Quaternion& qd) const + /**@todo document this and it's use */ + B3_FORCE_INLINE b3Quaternion farthest(const b3Quaternion& qd) const { - b3Quaternion diff,sum; + b3Quaternion diff, sum; diff = *this - qd; sum = *this + qd; - if( diff.dot(diff) > sum.dot(sum) ) + if (diff.dot(diff) > sum.dot(sum)) return qd; return (-qd); } /**@todo document this and it's use */ - B3_FORCE_INLINE b3Quaternion nearest( const b3Quaternion& qd) const + B3_FORCE_INLINE b3Quaternion nearest(const b3Quaternion& qd) const { - b3Quaternion diff,sum; + b3Quaternion diff, sum; diff = *this - qd; sum = *this + qd; - if( diff.dot(diff) < sum.dot(sum) ) + if (diff.dot(diff) < sum.dot(sum)) return qd; return (-qd); } - - /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion + /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion * @param q The other quaternion to interpolate with * @param t The ratio between this and q to interpolate. If t = 0 the result is this, if t=1 the result is q. * Slerp interpolates assuming constant velocity. */ b3Quaternion slerp(const b3Quaternion& q, const b3Scalar& t) const { - b3Scalar magnitude = b3Sqrt(length2() * q.length2()); - b3Assert(magnitude > b3Scalar(0)); + b3Scalar magnitude = b3Sqrt(length2() * q.length2()); + b3Assert(magnitude > b3Scalar(0)); - b3Scalar product = dot(q) / magnitude; - if (b3Fabs(product) < b3Scalar(1)) + b3Scalar product = dot(q) / magnitude; + if (b3Fabs(product) < b3Scalar(1)) { - // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp - const b3Scalar sign = (product < 0) ? b3Scalar(-1) : b3Scalar(1); - - const b3Scalar theta = b3Acos(sign * product); - const b3Scalar s1 = b3Sin(sign * t * theta); - const b3Scalar d = b3Scalar(1.0) / b3Sin(theta); - const b3Scalar s0 = b3Sin((b3Scalar(1.0) - t) * theta); - - return b3Quaternion( - (m_floats[0] * s0 + q.getX() * s1) * d, - (m_floats[1] * s0 + q.getY() * s1) * d, - (m_floats[2] * s0 + q.getZ() * s1) * d, - (m_floats[3] * s0 + q.m_floats[3] * s1) * d); + // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp + const b3Scalar sign = (product < 0) ? b3Scalar(-1) : b3Scalar(1); + + const b3Scalar theta = b3Acos(sign * product); + const b3Scalar s1 = b3Sin(sign * t * theta); + const b3Scalar d = b3Scalar(1.0) / b3Sin(theta); + const b3Scalar s0 = b3Sin((b3Scalar(1.0) - t) * theta); + + return b3Quaternion( + (m_floats[0] * s0 + q.getX() * s1) * d, + (m_floats[1] * s0 + q.getY() * s1) * d, + (m_floats[2] * s0 + q.getZ() * s1) * d, + (m_floats[3] * s0 + q.m_floats[3] * s1) * d); } else { @@ -546,301 +546,294 @@ public: } } - static const b3Quaternion& getIdentity() + static const b3Quaternion& getIdentity() { - static const b3Quaternion identityQuat(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.),b3Scalar(1.)); + static const b3Quaternion identityQuat(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.), b3Scalar(1.)); return identityQuat; } B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; } - - }; - - - - /**@brief Return the product of two quaternions */ B3_FORCE_INLINE b3Quaternion -operator*(const b3Quaternion& q1, const b3Quaternion& q2) +operator*(const b3Quaternion& q1, const b3Quaternion& q2) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) __m128 vQ1 = q1.get128(); __m128 vQ2 = q2.get128(); __m128 A0, A1, B1, A2, B2; - - A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0)); // X Y z x // vtrn - B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0)); // W W W X // vdup vext + + A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0)); // X Y z x // vtrn + B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0)); // W W W X // vdup vext A1 = A1 * B1; - - A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1)); // Y Z X Y // vext - B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); // z x Y Y // vtrn vdup + + A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1)); // Y Z X Y // vext + B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1)); // z x Y Y // vtrn vdup A2 = A2 * B2; - B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2)); // z x Y Z // vtrn vext - B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); // Y Z x z // vext vtrn - - B1 = B1 * B2; // A3 *= B3 + B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2)); // z x Y Z // vtrn vext + B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2)); // Y Z x z // vext vtrn + + B1 = B1 * B2; // A3 *= B3 - A0 = b3_splat_ps(vQ1, 3); // A0 - A0 = A0 * vQ2; // A0 * B0 + A0 = b3_splat_ps(vQ1, 3); // A0 + A0 = A0 * vQ2; // A0 * B0 + + A1 = A1 + A2; // AB12 + A0 = A0 - B1; // AB03 = AB0 - AB3 + + A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element + A0 = A0 + A1; // AB03 + AB12 - A1 = A1 + A2; // AB12 - A0 = A0 - B1; // AB03 = AB0 - AB3 - - A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element - A0 = A0 + A1; // AB03 + AB12 - return b3Quaternion(A0); -#elif defined(B3_USE_NEON) +#elif defined(B3_USE_NEON) float32x4_t vQ1 = q1.get128(); float32x4_t vQ2 = q2.get128(); float32x4_t A0, A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; + + { + float32x2x2_t tmp; + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x - B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X + A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x + B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z A1 = vmulq_f32(A1, B1); A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 - - // change the sign of the last element - A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); - A0 = vaddq_f32(A0, A1); // AB03 + AB12 - + A3 = vmulq_f32(A3, B3); // A3 *= B3 + A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 + + // change the sign of the last element + A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); + A0 = vaddq_f32(A0, A1); // AB03 + AB12 + return b3Quaternion(A0); #else return b3Quaternion( - q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(), + q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(), q1.getW() * q2.getY() + q1.getY() * q2.getW() + q1.getZ() * q2.getX() - q1.getX() * q2.getZ(), q1.getW() * q2.getZ() + q1.getZ() * q2.getW() + q1.getX() * q2.getY() - q1.getY() * q2.getX(), - q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ()); + q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ()); #endif } B3_FORCE_INLINE b3Quaternion operator*(const b3Quaternion& q, const b3Vector3& w) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) __m128 vQ1 = q.get128(); __m128 vQ2 = w.get128(); __m128 A1, B1, A2, B2, A3, B3; - - A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3,3,3,0)); - B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0,1,2,0)); + + A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3, 3, 3, 0)); + B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0, 1, 2, 0)); A1 = A1 * B1; - - A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1)); - B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); + + A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1)); + B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1)); A2 = A2 * B2; - A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2)); - B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); - - A3 = A3 * B3; // A3 *= B3 + A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2)); + B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2)); + + A3 = A3 * B3; // A3 *= B3 + + A1 = A1 + A2; // AB12 + A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element + A1 = A1 - A3; // AB123 = AB12 - AB3 - A1 = A1 + A2; // AB12 - A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element - A1 = A1 - A3; // AB123 = AB12 - AB3 - return b3Quaternion(A1); - -#elif defined(B3_USE_NEON) + +#elif defined(B3_USE_NEON) float32x4_t vQ1 = q.get128(); float32x4_t vQ2 = w.get128(); float32x4_t A1, B1, A2, B2, A3, B3; - float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz; - - vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); - { - float32x2x2_t tmp; + float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; + vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); + { + float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; - } + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; + } + + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W W X - B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx); // X Y z x + A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W W X + B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx); // X Y z x A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z A1 = vmulq_f32(A1, B1); A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - - // change the sign of the last element - A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); - - A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 - + A3 = vmulq_f32(A3, B3); // A3 *= B3 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + + // change the sign of the last element + A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); + + A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 + return b3Quaternion(A1); - + #else - return b3Quaternion( - q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(), - q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(), - q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(), - -q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ()); + return b3Quaternion( + q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(), + q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(), + q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(), + -q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ()); #endif } B3_FORCE_INLINE b3Quaternion operator*(const b3Vector3& w, const b3Quaternion& q) { -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) __m128 vQ1 = w.get128(); __m128 vQ2 = q.get128(); __m128 A1, B1, A2, B2, A3, B3; - - A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0)); // X Y z x - B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0)); // W W W X + + A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0)); // X Y z x + B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0)); // W W W X A1 = A1 * B1; - - A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1)); - B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); - A2 = A2 *B2; + A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1)); + B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1)); + + A2 = A2 * B2; + + A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2)); + B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2)); + + A3 = A3 * B3; // A3 *= B3 - A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2)); - B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); - - A3 = A3 * B3; // A3 *= B3 + A1 = A1 + A2; // AB12 + A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element + A1 = A1 - A3; // AB123 = AB12 - AB3 - A1 = A1 + A2; // AB12 - A1 = _mm_xor_ps(A1, b3vPPPM); // change sign of the last element - A1 = A1 - A3; // AB123 = AB12 - AB3 - return b3Quaternion(A1); -#elif defined(B3_USE_NEON) +#elif defined(B3_USE_NEON) float32x4_t vQ1 = w.get128(); float32x4_t vQ2 = q.get128(); - float32x4_t A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; + float32x4_t A1, B1, A2, B2, A3, B3; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + { + float32x2x2_t tmp; + + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; + + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x - B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X + A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x + B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z A1 = vmulq_f32(A1, B1); A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - - // change the sign of the last element - A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); - - A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 - + A3 = vmulq_f32(A3, B3); // A3 *= B3 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + + // change the sign of the last element + A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM); + + A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 + return b3Quaternion(A1); - + #else - return b3Quaternion( - +w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(), + return b3Quaternion( + +w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(), +w.getY() * q.getW() + w.getZ() * q.getX() - w.getX() * q.getZ(), +w.getZ() * q.getW() + w.getX() * q.getY() - w.getY() * q.getX(), - -w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ()); + -w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ()); #endif } /**@brief Calculate the dot product between two quaternions */ -B3_FORCE_INLINE b3Scalar -b3Dot(const b3Quaternion& q1, const b3Quaternion& q2) -{ - return q1.dot(q2); +B3_FORCE_INLINE b3Scalar +b3Dot(const b3Quaternion& q1, const b3Quaternion& q2) +{ + return q1.dot(q2); } - /**@brief Return the length of a quaternion */ B3_FORCE_INLINE b3Scalar -b3Length(const b3Quaternion& q) -{ - return q.length(); +b3Length(const b3Quaternion& q) +{ + return q.length(); } /**@brief Return the angle between two quaternions*/ B3_FORCE_INLINE b3Scalar -b3Angle(const b3Quaternion& q1, const b3Quaternion& q2) -{ - return q1.angle(q2); +b3Angle(const b3Quaternion& q1, const b3Quaternion& q2) +{ + return q1.angle(q2); } /**@brief Return the inverse of a quaternion*/ B3_FORCE_INLINE b3Quaternion -b3Inverse(const b3Quaternion& q) +b3Inverse(const b3Quaternion& q) { return q.inverse(); } @@ -851,7 +844,7 @@ b3Inverse(const b3Quaternion& q) * @param t The ration between q1 and q2. t = 0 return q1, t=1 returns q2 * Slerp assumes constant velocity between positions. */ B3_FORCE_INLINE b3Quaternion -b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t) +b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t) { return q1.slerp(q2, t); } @@ -859,7 +852,7 @@ b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t) B3_FORCE_INLINE b3Quaternion b3QuatMul(const b3Quaternion& rot0, const b3Quaternion& rot1) { - return rot0*rot1; + return rot0 * rot1; } B3_FORCE_INLINE b3Quaternion @@ -868,51 +861,45 @@ b3QuatNormalized(const b3Quaternion& orn) return orn.normalized(); } - - -B3_FORCE_INLINE b3Vector3 -b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v) +B3_FORCE_INLINE b3Vector3 +b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v) { b3Quaternion q = rotation * v; q *= rotation.inverse(); -#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3MakeVector3(_mm_and_ps(q.get128(), b3vFFF0fMask)); #elif defined(B3_USE_NEON) - return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask)); -#else - return b3MakeVector3(q.getX(),q.getY(),q.getZ()); + return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask)); +#else + return b3MakeVector3(q.getX(), q.getY(), q.getZ()); #endif } -B3_FORCE_INLINE b3Quaternion -b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized +B3_FORCE_INLINE b3Quaternion +b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized { b3Vector3 c = v0.cross(v1); - b3Scalar d = v0.dot(v1); + b3Scalar d = v0.dot(v1); if (d < -1.0 + B3_EPSILON) { - b3Vector3 n,unused; - b3PlaneSpace1(v0,n,unused); - return b3Quaternion(n.getX(),n.getY(),n.getZ(),0.0f); // just pick any vector that is orthogonal to v0 + b3Vector3 n, unused; + b3PlaneSpace1(v0, n, unused); + return b3Quaternion(n.getX(), n.getY(), n.getZ(), 0.0f); // just pick any vector that is orthogonal to v0 } - b3Scalar s = b3Sqrt((1.0f + d) * 2.0f); + b3Scalar s = b3Sqrt((1.0f + d) * 2.0f); b3Scalar rs = 1.0f / s; - return b3Quaternion(c.getX()*rs,c.getY()*rs,c.getZ()*rs,s * 0.5f); - + return b3Quaternion(c.getX() * rs, c.getY() * rs, c.getZ() * rs, s * 0.5f); } -B3_FORCE_INLINE b3Quaternion -b3ShortestArcQuatNormalize2(b3Vector3& v0,b3Vector3& v1) +B3_FORCE_INLINE b3Quaternion +b3ShortestArcQuatNormalize2(b3Vector3& v0, b3Vector3& v1) { v0.normalize(); v1.normalize(); - return b3ShortestArcQuat(v0,v1); + return b3ShortestArcQuat(v0, v1); } -#endif //B3_SIMD__QUATERNION_H_ - - - +#endif //B3_SIMD__QUATERNION_H_ diff --git a/thirdparty/bullet/Bullet3Common/b3Random.h b/thirdparty/bullet/Bullet3Common/b3Random.h index dc040f1562..c2e21496c7 100644 --- a/thirdparty/bullet/Bullet3Common/b3Random.h +++ b/thirdparty/bullet/Bullet3Common/b3Random.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_GEN_RANDOM_H #define B3_GEN_RANDOM_H @@ -26,8 +24,8 @@ subject to the following restrictions: #define B3_RAND_MAX UINT_MAX -B3_FORCE_INLINE void b3Srand(unsigned int seed) { init_genrand(seed); } -B3_FORCE_INLINE unsigned int b3rand() { return genrand_int32(); } +B3_FORCE_INLINE void b3Srand(unsigned int seed) { init_genrand(seed); } +B3_FORCE_INLINE unsigned int b3rand() { return genrand_int32(); } #else @@ -35,8 +33,8 @@ B3_FORCE_INLINE unsigned int b3rand() { return genrand_int32() #define B3_RAND_MAX RAND_MAX -B3_FORCE_INLINE void b3Srand(unsigned int seed) { srand(seed); } -B3_FORCE_INLINE unsigned int b3rand() { return rand(); } +B3_FORCE_INLINE void b3Srand(unsigned int seed) { srand(seed); } +B3_FORCE_INLINE unsigned int b3rand() { return rand(); } #endif @@ -45,6 +43,4 @@ inline b3Scalar b3RandRange(b3Scalar minRange, b3Scalar maxRange) return (b3rand() / (b3Scalar(B3_RAND_MAX) + b3Scalar(1.0))) * (maxRange - minRange) + minRange; } - -#endif //B3_GEN_RANDOM_H - +#endif //B3_GEN_RANDOM_H diff --git a/thirdparty/bullet/Bullet3Common/b3ResizablePool.h b/thirdparty/bullet/Bullet3Common/b3ResizablePool.h index 06ad8a778d..cafe3ff396 100644 --- a/thirdparty/bullet/Bullet3Common/b3ResizablePool.h +++ b/thirdparty/bullet/Bullet3Common/b3ResizablePool.h @@ -4,10 +4,10 @@ #include "Bullet3Common/b3AlignedObjectArray.h" -enum +enum { - B3_POOL_HANDLE_TERMINAL_FREE=-1, - B3_POOL_HANDLE_TERMINAL_USED =-2 + B3_POOL_HANDLE_TERMINAL_FREE = -1, + B3_POOL_HANDLE_TERMINAL_USED = -2 }; template <typename U> @@ -20,25 +20,23 @@ struct b3PoolBodyHandle : public U { m_nextFreeHandle = next; } - int getNextFree() const + int getNextFree() const { return m_nextFreeHandle; } }; -template <typename T> +template <typename T> class b3ResizablePool { - protected: - b3AlignedObjectArray<T> m_bodyHandles; - int m_numUsedHandles; // number of active handles - int m_firstFreeHandle; // free handles list + b3AlignedObjectArray<T> m_bodyHandles; + int m_numUsedHandles; // number of active handles + int m_firstFreeHandle; // free handles list T* getHandleInternal(int handle) { return &m_bodyHandles[handle]; - } const T* getHandleInternal(int handle) const { @@ -46,17 +44,16 @@ protected: } public: - b3ResizablePool() { initHandles(); } - + virtual ~b3ResizablePool() { exitHandles(); } -///handle management + ///handle management int getNumHandles() const { @@ -65,44 +62,40 @@ public: void getUsedHandles(b3AlignedObjectArray<int>& usedHandles) const { - - for (int i=0;i<m_bodyHandles.size();i++) + for (int i = 0; i < m_bodyHandles.size(); i++) { - if (m_bodyHandles[i].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED) + if (m_bodyHandles[i].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED) { usedHandles.push_back(i); } } } - - T* getHandle(int handle) { - b3Assert(handle>=0); - b3Assert(handle<m_bodyHandles.size()); - if ((handle<0) || (handle>=m_bodyHandles.size())) + b3Assert(handle >= 0); + b3Assert(handle < m_bodyHandles.size()); + if ((handle < 0) || (handle >= m_bodyHandles.size())) { return 0; } - if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED) + if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED) { return &m_bodyHandles[handle]; } return 0; - } const T* getHandle(int handle) const { - b3Assert(handle>=0); - b3Assert(handle<m_bodyHandles.size()); - if ((handle<0) || (handle>=m_bodyHandles.size())) + b3Assert(handle >= 0); + b3Assert(handle < m_bodyHandles.size()); + if ((handle < 0) || (handle >= m_bodyHandles.size())) { return 0; } - if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED) + if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED) { return &m_bodyHandles[handle]; } @@ -120,7 +113,6 @@ public: for (int i = curCapacity; i < newCapacity; i++) m_bodyHandles[i].setNextFree(i + 1); - m_bodyHandles[newCapacity - 1].setNextFree(-1); } m_firstFreeHandle = curCapacity; @@ -142,19 +134,18 @@ public: int allocHandle() { - b3Assert(m_firstFreeHandle>=0); + b3Assert(m_firstFreeHandle >= 0); int handle = m_firstFreeHandle; m_firstFreeHandle = getHandleInternal(handle)->getNextFree(); m_numUsedHandles++; - if (m_firstFreeHandle<0) + if (m_firstFreeHandle < 0) { //int curCapacity = m_bodyHandles.size(); - int additionalCapacity= m_bodyHandles.size(); + int additionalCapacity = m_bodyHandles.size(); increaseHandleCapacity(additionalCapacity); - getHandleInternal(handle)->setNextFree(m_firstFreeHandle); } getHandleInternal(handle)->setNextFree(B3_POOL_HANDLE_TERMINAL_USED); @@ -162,12 +153,11 @@ public: return handle; } - void freeHandle(int handle) { b3Assert(handle >= 0); - if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED) + if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED) { getHandleInternal(handle)->clear(); getHandleInternal(handle)->setNextFree(m_firstFreeHandle); @@ -176,7 +166,6 @@ public: } } }; - ///end handle management - - #endif //B3_RESIZABLE_POOL_H -
\ No newline at end of file +///end handle management + +#endif //B3_RESIZABLE_POOL_H diff --git a/thirdparty/bullet/Bullet3Common/b3Scalar.h b/thirdparty/bullet/Bullet3Common/b3Scalar.h index dbc7fea397..0db5eb6f4f 100644 --- a/thirdparty/bullet/Bullet3Common/b3Scalar.h +++ b/thirdparty/bullet/Bullet3Common/b3Scalar.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_SCALAR_H #define B3_SCALAR_H @@ -22,238 +20,252 @@ subject to the following restrictions: #pragma unmanaged #endif - - #include <math.h> -#include <stdlib.h>//size_t for MSVC 6.0 +#include <stdlib.h> //size_t for MSVC 6.0 #include <float.h> //Original repository is at http://github.com/erwincoumans/bullet3 #define B3_BULLET_VERSION 300 -inline int b3GetVersion() +inline int b3GetVersion() { return B3_BULLET_VERSION; } -#if defined(DEBUG) || defined (_DEBUG) +#if defined(DEBUG) || defined(_DEBUG) #define B3_DEBUG #endif -#include "b3Logging.h"//for b3Error - +#include "b3Logging.h" //for b3Error #ifdef _WIN32 - #if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300) +#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined(_MSC_VER) && _MSC_VER < 1300) - #define B3_FORCE_INLINE inline - #define B3_ATTRIBUTE_ALIGNED16(a) a - #define B3_ATTRIBUTE_ALIGNED64(a) a - #define B3_ATTRIBUTE_ALIGNED128(a) a - #else - //#define B3_HAS_ALIGNED_ALLOCATOR - #pragma warning(disable : 4324) // disable padding warning +#define B3_FORCE_INLINE inline +#define B3_ATTRIBUTE_ALIGNED16(a) a +#define B3_ATTRIBUTE_ALIGNED64(a) a +#define B3_ATTRIBUTE_ALIGNED128(a) a +#else +//#define B3_HAS_ALIGNED_ALLOCATOR +#pragma warning(disable : 4324) // disable padding warning // #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning. - #pragma warning(disable:4996) //Turn off warnings about deprecated C routines +#pragma warning(disable : 4996) //Turn off warnings about deprecated C routines // #pragma warning(disable:4786) // Disable the "debug name too long" warning - #define B3_FORCE_INLINE __forceinline - #define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a - #define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a - #define B3_ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a - #ifdef _XBOX - #define B3_USE_VMX128 - - #include <ppcintrinsics.h> - #define B3_HAVE_NATIVE_FSEL - #define b3Fsel(a,b,c) __fsel((a),(b),(c)) - #else - -#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (B3_USE_DOUBLE_PRECISION)) - #if (defined (_M_IX86) || defined (_M_X64)) - #define B3_USE_SSE - #ifdef B3_USE_SSE - //B3_USE_SSE_IN_API is disabled under Windows by default, because - //it makes it harder to integrate Bullet into your application under Windows - //(structured embedding Bullet structs/classes need to be 16-byte aligned) - //with relatively little performance gain - //If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries - //you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage) - //#define B3_USE_SSE_IN_API - #endif //B3_USE_SSE - #include <emmintrin.h> - #endif +#define B3_FORCE_INLINE __forceinline +#define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a +#define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a +#define B3_ATTRIBUTE_ALIGNED128(a) __declspec(align(128)) a +#ifdef _XBOX +#define B3_USE_VMX128 + +#include <ppcintrinsics.h> +#define B3_HAVE_NATIVE_FSEL +#define b3Fsel(a, b, c) __fsel((a), (b), (c)) +#else + +#if (defined(_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined(B3_USE_DOUBLE_PRECISION)) +#if (defined(_M_IX86) || defined(_M_X64)) +#define B3_USE_SSE +#ifdef B3_USE_SSE +//B3_USE_SSE_IN_API is disabled under Windows by default, because +//it makes it harder to integrate Bullet into your application under Windows +//(structured embedding Bullet structs/classes need to be 16-byte aligned) +//with relatively little performance gain +//If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries +//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage) +//#define B3_USE_SSE_IN_API +#endif //B3_USE_SSE +#include <emmintrin.h> +#endif #endif - #endif//_XBOX +#endif //_XBOX - #endif //__MINGW32__ +#endif //__MINGW32__ #ifdef B3_DEBUG - #ifdef _MSC_VER - #include <stdio.h> - #define b3Assert(x) { if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak(); }} - #else//_MSC_VER - #include <assert.h> - #define b3Assert assert - #endif//_MSC_VER +#ifdef _MSC_VER +#include <stdio.h> +#define b3Assert(x) \ + { \ + if (!(x)) \ + { \ + b3Error( \ + "Assert "__FILE__ \ + ":%u (" #x ")\n", \ + __LINE__); \ + __debugbreak(); \ + } \ + } +#else //_MSC_VER +#include <assert.h> +#define b3Assert assert +#endif //_MSC_VER #else - #define b3Assert(x) +#define b3Assert(x) #endif - //b3FullAssert is optional, slows down a lot - #define b3FullAssert(x) +//b3FullAssert is optional, slows down a lot +#define b3FullAssert(x) - #define b3Likely(_c) _c - #define b3Unlikely(_c) _c +#define b3Likely(_c) _c +#define b3Unlikely(_c) _c #else - -#if defined (__CELLOS_LV2__) - #define B3_FORCE_INLINE inline __attribute__((always_inline)) - #define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) - #define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) - #define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) - #ifndef assert - #include <assert.h> - #endif + +#if defined(__CELLOS_LV2__) +#define B3_FORCE_INLINE inline __attribute__((always_inline)) +#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16))) +#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64))) +#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128))) +#ifndef assert +#include <assert.h> +#endif #ifdef B3_DEBUG #ifdef __SPU__ #include <spu_printf.h> #define printf spu_printf - #define b3Assert(x) {if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}} +#define b3Assert(x) \ + { \ + if (!(x)) \ + { \ + b3Error( \ + "Assert "__FILE__ \ + ":%u (" #x ")\n", \ + __LINE__); \ + spu_hcmpeq(0, 0); \ + } \ + } #else - #define b3Assert assert +#define b3Assert assert #endif - + #else - #define b3Assert(x) +#define b3Assert(x) #endif - //b3FullAssert is optional, slows down a lot - #define b3FullAssert(x) +//b3FullAssert is optional, slows down a lot +#define b3FullAssert(x) - #define b3Likely(_c) _c - #define b3Unlikely(_c) _c +#define b3Likely(_c) _c +#define b3Unlikely(_c) _c #else #ifdef USE_LIBSPE2 - #define B3_FORCE_INLINE __inline - #define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) - #define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) - #define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) - #ifndef assert - #include <assert.h> - #endif +#define B3_FORCE_INLINE __inline +#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16))) +#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64))) +#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128))) +#ifndef assert +#include <assert.h> +#endif #ifdef B3_DEBUG - #define b3Assert assert +#define b3Assert assert #else - #define b3Assert(x) +#define b3Assert(x) #endif - //b3FullAssert is optional, slows down a lot - #define b3FullAssert(x) +//b3FullAssert is optional, slows down a lot +#define b3FullAssert(x) +#define b3Likely(_c) __builtin_expect((_c), 1) +#define b3Unlikely(_c) __builtin_expect((_c), 0) - #define b3Likely(_c) __builtin_expect((_c), 1) - #define b3Unlikely(_c) __builtin_expect((_c), 0) - - #else - //non-windows systems - -#if (defined (__APPLE__) && (!defined (B3_USE_DOUBLE_PRECISION))) - #if defined (__i386__) || defined (__x86_64__) - #define B3_USE_SSE - //B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries - //if apps run into issues, we will disable the next line - #define B3_USE_SSE_IN_API - #ifdef B3_USE_SSE - // include appropriate SSE level - #if defined (__SSE4_1__) - #include <smmintrin.h> - #elif defined (__SSSE3__) - #include <tmmintrin.h> - #elif defined (__SSE3__) - #include <pmmintrin.h> - #else - #include <emmintrin.h> - #endif - #endif //B3_USE_SSE - #elif defined( __armv7__ ) - #ifdef __clang__ - #define B3_USE_NEON 1 - - #if defined B3_USE_NEON && defined (__clang__) - #include <arm_neon.h> - #endif//B3_USE_NEON - #endif //__clang__ - #endif//__arm__ - - #define B3_FORCE_INLINE inline __attribute__ ((always_inline)) +//non-windows systems + +#if (defined(__APPLE__) && (!defined(B3_USE_DOUBLE_PRECISION))) +#if defined(__i386__) || defined(__x86_64__) +#define B3_USE_SSE +//B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries +//if apps run into issues, we will disable the next line +#define B3_USE_SSE_IN_API +#ifdef B3_USE_SSE +// include appropriate SSE level +#if defined(__SSE4_1__) +#include <smmintrin.h> +#elif defined(__SSSE3__) +#include <tmmintrin.h> +#elif defined(__SSE3__) +#include <pmmintrin.h> +#else +#include <emmintrin.h> +#endif +#endif //B3_USE_SSE +#elif defined(__armv7__) +#ifdef __clang__ +#define B3_USE_NEON 1 + +#if defined B3_USE_NEON && defined(__clang__) +#include <arm_neon.h> +#endif //B3_USE_NEON +#endif //__clang__ +#endif //__arm__ + +#define B3_FORCE_INLINE inline __attribute__((always_inline)) ///@todo: check out alignment methods for other platforms/compilers - #define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) - #define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) - #define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) - #ifndef assert - #include <assert.h> - #endif - - #if defined(DEBUG) || defined (_DEBUG) - #if defined (__i386__) || defined (__x86_64__) - #include <stdio.h> - #define b3Assert(x)\ - {\ - if(!(x))\ - {\ - b3Error("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\ - asm volatile ("int3");\ - }\ +#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16))) +#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64))) +#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128))) +#ifndef assert +#include <assert.h> +#endif + +#if defined(DEBUG) || defined(_DEBUG) +#if defined(__i386__) || defined(__x86_64__) +#include <stdio.h> +#define b3Assert(x) \ + { \ + if (!(x)) \ + { \ + b3Error("Assert %s in line %d, file %s\n", #x, __LINE__, __FILE__); \ + asm volatile("int3"); \ + } \ } - #else//defined (__i386__) || defined (__x86_64__) - #define b3Assert assert - #endif//defined (__i386__) || defined (__x86_64__) - #else//defined(DEBUG) || defined (_DEBUG) - #define b3Assert(x) - #endif//defined(DEBUG) || defined (_DEBUG) - - //b3FullAssert is optional, slows down a lot - #define b3FullAssert(x) - #define b3Likely(_c) _c - #define b3Unlikely(_c) _c +#else //defined (__i386__) || defined (__x86_64__) +#define b3Assert assert +#endif //defined (__i386__) || defined (__x86_64__) +#else //defined(DEBUG) || defined (_DEBUG) +#define b3Assert(x) +#endif //defined(DEBUG) || defined (_DEBUG) + +//b3FullAssert is optional, slows down a lot +#define b3FullAssert(x) +#define b3Likely(_c) _c +#define b3Unlikely(_c) _c #else - #define B3_FORCE_INLINE inline - ///@todo: check out alignment methods for other platforms/compilers - #define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) - #define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) - #define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) - ///#define B3_ATTRIBUTE_ALIGNED16(a) a - ///#define B3_ATTRIBUTE_ALIGNED64(a) a - ///#define B3_ATTRIBUTE_ALIGNED128(a) a - #ifndef assert - #include <assert.h> - #endif - -#if defined(DEBUG) || defined (_DEBUG) - #define b3Assert assert +#define B3_FORCE_INLINE inline +///@todo: check out alignment methods for other platforms/compilers +#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16))) +#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64))) +#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128))) +///#define B3_ATTRIBUTE_ALIGNED16(a) a +///#define B3_ATTRIBUTE_ALIGNED64(a) a +///#define B3_ATTRIBUTE_ALIGNED128(a) a +#ifndef assert +#include <assert.h> +#endif + +#if defined(DEBUG) || defined(_DEBUG) +#define b3Assert assert #else - #define b3Assert(x) +#define b3Assert(x) #endif - //b3FullAssert is optional, slows down a lot - #define b3FullAssert(x) - #define b3Likely(_c) _c - #define b3Unlikely(_c) _c -#endif //__APPLE__ +//b3FullAssert is optional, slows down a lot +#define b3FullAssert(x) +#define b3Likely(_c) _c +#define b3Unlikely(_c) _c +#endif //__APPLE__ -#endif // LIBSPE2 +#endif // LIBSPE2 -#endif //__CELLOS_LV2__ +#endif //__CELLOS_LV2__ #endif - ///The b3Scalar type abstracts floating point numbers, to easily switch between double and single floating point precision. #if defined(B3_USE_DOUBLE_PRECISION) typedef double b3Scalar; @@ -267,34 +279,34 @@ typedef float b3Scalar; #ifdef B3_USE_SSE typedef __m128 b3SimdFloat4; -#endif//B3_USE_SSE +#endif //B3_USE_SSE -#if defined B3_USE_SSE_IN_API && defined (B3_USE_SSE) +#if defined B3_USE_SSE_IN_API && defined(B3_USE_SSE) #ifdef _WIN32 #ifndef B3_NAN static int b3NanMask = 0x7F800001; -#define B3_NAN (*(float*)&b3NanMask) +#define B3_NAN (*(float *)&b3NanMask) #endif #ifndef B3_INFINITY_MASK -static int b3InfinityMask = 0x7F800000; -#define B3_INFINITY_MASK (*(float*)&b3InfinityMask) +static int b3InfinityMask = 0x7F800000; +#define B3_INFINITY_MASK (*(float *)&b3InfinityMask) #endif -inline __m128 operator + (const __m128 A, const __m128 B) +inline __m128 operator+(const __m128 A, const __m128 B) { - return _mm_add_ps(A, B); + return _mm_add_ps(A, B); } -inline __m128 operator - (const __m128 A, const __m128 B) +inline __m128 operator-(const __m128 A, const __m128 B) { - return _mm_sub_ps(A, B); + return _mm_sub_ps(A, B); } -inline __m128 operator * (const __m128 A, const __m128 B) +inline __m128 operator*(const __m128 A, const __m128 B) { - return _mm_mul_ps(A, B); + return _mm_mul_ps(A, B); } #define b3CastfTo128i(a) (_mm_castps_si128(a)) @@ -302,18 +314,19 @@ inline __m128 operator * (const __m128 A, const __m128 B) #define b3CastiTo128f(a) (_mm_castsi128_ps(a)) #define b3CastdTo128f(a) (_mm_castpd_ps(a)) #define b3CastdTo128i(a) (_mm_castpd_si128(a)) -#define b3Assign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3) +#define b3Assign128(r0, r1, r2, r3) _mm_setr_ps(r0, r1, r2, r3) -#else//_WIN32 +#else //_WIN32 #define b3CastfTo128i(a) ((__m128i)(a)) #define b3CastfTo128d(a) ((__m128d)(a)) -#define b3CastiTo128f(a) ((__m128) (a)) -#define b3CastdTo128f(a) ((__m128) (a)) +#define b3CastiTo128f(a) ((__m128)(a)) +#define b3CastdTo128f(a) ((__m128)(a)) #define b3CastdTo128i(a) ((__m128i)(a)) -#define b3Assign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3} -#endif//_WIN32 -#endif //B3_USE_SSE_IN_API +#define b3Assign128(r0, r1, r2, r3) \ + (__m128) { r0, r1, r2, r3 } +#endif //_WIN32 +#endif //B3_USE_SSE_IN_API #ifdef B3_USE_NEON #include <arm_neon.h> @@ -321,142 +334,160 @@ inline __m128 operator * (const __m128 A, const __m128 B) typedef float32x4_t b3SimdFloat4; #define B3_INFINITY INFINITY #define B3_NAN NAN -#define b3Assign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3} +#define b3Assign128(r0, r1, r2, r3) \ + (float32x4_t) { r0, r1, r2, r3 } #endif - - - - -#define B3_DECLARE_ALIGNED_ALLOCATOR() \ - B3_FORCE_INLINE void* operator new(size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes,16); } \ - B3_FORCE_INLINE void operator delete(void* ptr) { b3AlignedFree(ptr); } \ - B3_FORCE_INLINE void* operator new(size_t, void* ptr) { return ptr; } \ - B3_FORCE_INLINE void operator delete(void*, void*) { } \ - B3_FORCE_INLINE void* operator new[](size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes,16); } \ - B3_FORCE_INLINE void operator delete[](void* ptr) { b3AlignedFree(ptr); } \ - B3_FORCE_INLINE void* operator new[](size_t, void* ptr) { return ptr; } \ - B3_FORCE_INLINE void operator delete[](void*, void*) { } \ - - +#define B3_DECLARE_ALIGNED_ALLOCATOR() \ + B3_FORCE_INLINE void *operator new(size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); } \ + B3_FORCE_INLINE void operator delete(void *ptr) { b3AlignedFree(ptr); } \ + B3_FORCE_INLINE void *operator new(size_t, void *ptr) { return ptr; } \ + B3_FORCE_INLINE void operator delete(void *, void *) {} \ + B3_FORCE_INLINE void *operator new[](size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); } \ + B3_FORCE_INLINE void operator delete[](void *ptr) { b3AlignedFree(ptr); } \ + B3_FORCE_INLINE void *operator new[](size_t, void *ptr) { return ptr; } \ + B3_FORCE_INLINE void operator delete[](void *, void *) {} #if defined(B3_USE_DOUBLE_PRECISION) || defined(B3_FORCE_DOUBLE_FUNCTIONS) - -B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x) { return sqrt(x); } + +B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x) +{ + return sqrt(x); +} B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabs(x); } B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cos(x); } B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sin(x); } B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tan(x); } -B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { if (x<b3Scalar(-1)) x=b3Scalar(-1); if (x>b3Scalar(1)) x=b3Scalar(1); return acos(x); } -B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { if (x<b3Scalar(-1)) x=b3Scalar(-1); if (x>b3Scalar(1)) x=b3Scalar(1); return asin(x); } +B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) +{ + if (x < b3Scalar(-1)) x = b3Scalar(-1); + if (x > b3Scalar(1)) x = b3Scalar(1); + return acos(x); +} +B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) +{ + if (x < b3Scalar(-1)) x = b3Scalar(-1); + if (x > b3Scalar(1)) x = b3Scalar(1); + return asin(x); +} B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atan(x); } B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2(x, y); } B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return exp(x); } B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return log(x); } -B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return pow(x,y); } -B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmod(x,y); } +B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return pow(x, y); } +B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmod(x, y); } #else - -B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y) -{ + +B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y) +{ #ifdef USE_APPROXIMATION - double x, z, tempf; - unsigned long *tfptr = ((unsigned long *)&tempf) + 1; + double x, z, tempf; + unsigned long *tfptr = ((unsigned long *)&tempf) + 1; tempf = y; - *tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */ - x = tempf; - z = y*b3Scalar(0.5); - x = (b3Scalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */ - x = (b3Scalar(1.5)*x)-(x*x)*(x*z); - x = (b3Scalar(1.5)*x)-(x*x)*(x*z); - x = (b3Scalar(1.5)*x)-(x*x)*(x*z); - x = (b3Scalar(1.5)*x)-(x*x)*(x*z); - return x*y; + *tfptr = (0xbfcdd90a - *tfptr) >> 1; /* estimate of 1/sqrt(y) */ + x = tempf; + z = y * b3Scalar(0.5); + x = (b3Scalar(1.5) * x) - (x * x) * (x * z); /* iteration formula */ + x = (b3Scalar(1.5) * x) - (x * x) * (x * z); + x = (b3Scalar(1.5) * x) - (x * x) * (x * z); + x = (b3Scalar(1.5) * x) - (x * x) * (x * z); + x = (b3Scalar(1.5) * x) - (x * x) * (x * z); + return x * y; #else - return sqrtf(y); + return sqrtf(y); #endif } B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabsf(x); } B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cosf(x); } B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sinf(x); } B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tanf(x); } -B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { - if (x<b3Scalar(-1)) - x=b3Scalar(-1); - if (x>b3Scalar(1)) - x=b3Scalar(1); - return acosf(x); +B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) +{ + if (x < b3Scalar(-1)) + x = b3Scalar(-1); + if (x > b3Scalar(1)) + x = b3Scalar(1); + return acosf(x); } -B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { - if (x<b3Scalar(-1)) - x=b3Scalar(-1); - if (x>b3Scalar(1)) - x=b3Scalar(1); - return asinf(x); +B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) +{ + if (x < b3Scalar(-1)) + x = b3Scalar(-1); + if (x > b3Scalar(1)) + x = b3Scalar(1); + return asinf(x); } B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atanf(x); } B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2f(x, y); } B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return expf(x); } B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return logf(x); } -B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return powf(x,y); } -B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmodf(x,y); } - +B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return powf(x, y); } +B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmodf(x, y); } + #endif -#define B3_2_PI b3Scalar(6.283185307179586232) -#define B3_PI (B3_2_PI * b3Scalar(0.5)) -#define B3_HALF_PI (B3_2_PI * b3Scalar(0.25)) +#define B3_2_PI b3Scalar(6.283185307179586232) +#define B3_PI (B3_2_PI * b3Scalar(0.5)) +#define B3_HALF_PI (B3_2_PI * b3Scalar(0.25)) #define B3_RADS_PER_DEG (B3_2_PI / b3Scalar(360.0)) -#define B3_DEGS_PER_RAD (b3Scalar(360.0) / B3_2_PI) +#define B3_DEGS_PER_RAD (b3Scalar(360.0) / B3_2_PI) #define B3_SQRT12 b3Scalar(0.7071067811865475244008443621048490) -#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0)/b3Sqrt(b3Scalar(x)))) /* reciprocal square root */ - +#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0) / b3Sqrt(b3Scalar(x)))) /* reciprocal square root */ #ifdef B3_USE_DOUBLE_PRECISION -#define B3_EPSILON DBL_EPSILON -#define B3_INFINITY DBL_MAX +#define B3_EPSILON DBL_EPSILON +#define B3_INFINITY DBL_MAX #else -#define B3_EPSILON FLT_EPSILON -#define B3_INFINITY FLT_MAX +#define B3_EPSILON FLT_EPSILON +#define B3_INFINITY FLT_MAX #endif -B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x) +B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x) { b3Scalar coeff_1 = B3_PI / 4.0f; b3Scalar coeff_2 = 3.0f * coeff_1; b3Scalar abs_y = b3Fabs(y); b3Scalar angle; - if (x >= 0.0f) { + if (x >= 0.0f) + { b3Scalar r = (x - abs_y) / (x + abs_y); angle = coeff_1 - coeff_1 * r; - } else { + } + else + { b3Scalar r = (x + abs_y) / (abs_y - x); angle = coeff_2 - coeff_1 * r; } return (y < 0.0f) ? -angle : angle; } -B3_FORCE_INLINE bool b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; } +B3_FORCE_INLINE bool b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; } -B3_FORCE_INLINE bool b3Equal(b3Scalar a, b3Scalar eps) { +B3_FORCE_INLINE bool b3Equal(b3Scalar a, b3Scalar eps) +{ return (((a) <= eps) && !((a) < -eps)); } -B3_FORCE_INLINE bool b3GreaterEqual (b3Scalar a, b3Scalar eps) { +B3_FORCE_INLINE bool b3GreaterEqual(b3Scalar a, b3Scalar eps) +{ return (!((a) <= eps)); } - -B3_FORCE_INLINE int b3IsNegative(b3Scalar x) { - return x < b3Scalar(0.0) ? 1 : 0; +B3_FORCE_INLINE int b3IsNegative(b3Scalar x) +{ + return x < b3Scalar(0.0) ? 1 : 0; } B3_FORCE_INLINE b3Scalar b3Radians(b3Scalar x) { return x * B3_RADS_PER_DEG; } B3_FORCE_INLINE b3Scalar b3Degrees(b3Scalar x) { return x * B3_DEGS_PER_RAD; } -#define B3_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name +#define B3_DECLARE_HANDLE(name) \ + typedef struct name##__ \ + { \ + int unused; \ + } * name #ifndef b3Fsel B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c) @@ -464,60 +495,57 @@ B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c) return a >= 0 ? b : c; } #endif -#define b3Fsels(a,b,c) (b3Scalar)b3Fsel(a,b,c) - +#define b3Fsels(a, b, c) (b3Scalar) b3Fsel(a, b, c) B3_FORCE_INLINE bool b3MachineIsLittleEndian() { - long int i = 1; - const char *p = (const char *) &i; - if (p[0] == 1) // Lowest address contains the least significant byte - return true; - else - return false; + long int i = 1; + const char *p = (const char *)&i; + if (p[0] == 1) // Lowest address contains the least significant byte + return true; + else + return false; } - - ///b3Select avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360 ///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html -B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) +B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) { - // Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero - // Rely on positive value or'ed with its negative having sign bit on - // and zero value or'ed with its negative (which is still zero) having sign bit off - // Use arithmetic shift right, shifting the sign bit through all 32 bits - unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); - unsigned testEqz = ~testNz; - return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); + // Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero + // Rely on positive value or'ed with its negative having sign bit on + // and zero value or'ed with its negative (which is still zero) having sign bit off + // Use arithmetic shift right, shifting the sign bit through all 32 bits + unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); + unsigned testEqz = ~testNz; + return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); } B3_FORCE_INLINE int b3Select(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero) { - unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); - unsigned testEqz = ~testNz; - return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); + unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); + unsigned testEqz = ~testNz; + return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); } B3_FORCE_INLINE float b3Select(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero) { #ifdef B3_HAVE_NATIVE_FSEL - return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero); + return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero); #else - return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero; + return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero; #endif } -template<typename T> B3_FORCE_INLINE void b3Swap(T& a, T& b) +template <typename T> +B3_FORCE_INLINE void b3Swap(T &a, T &b) { T tmp = a; a = b; b = tmp; } - //PCK: endian swapping functions B3_FORCE_INLINE unsigned b3SwapEndian(unsigned val) { - return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24)); + return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24)); } B3_FORCE_INLINE unsigned short b3SwapEndian(unsigned short val) @@ -532,87 +560,85 @@ B3_FORCE_INLINE unsigned b3SwapEndian(int val) B3_FORCE_INLINE unsigned short b3SwapEndian(short val) { - return b3SwapEndian((unsigned short) val); + return b3SwapEndian((unsigned short)val); } ///b3SwapFloat uses using char pointers to swap the endianness ////b3SwapFloat/b3SwapDouble will NOT return a float, because the machine might 'correct' invalid floating point values -///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. -///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. -///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. +///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. +///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. +///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. ///so instead of returning a float/double, we return integer/long long integer -B3_FORCE_INLINE unsigned int b3SwapEndianFloat(float d) +B3_FORCE_INLINE unsigned int b3SwapEndianFloat(float d) { - unsigned int a = 0; - unsigned char *dst = (unsigned char *)&a; - unsigned char *src = (unsigned char *)&d; - - dst[0] = src[3]; - dst[1] = src[2]; - dst[2] = src[1]; - dst[3] = src[0]; - return a; + unsigned int a = 0; + unsigned char *dst = (unsigned char *)&a; + unsigned char *src = (unsigned char *)&d; + + dst[0] = src[3]; + dst[1] = src[2]; + dst[2] = src[1]; + dst[3] = src[0]; + return a; } // unswap using char pointers -B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a) +B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a) { - float d = 0.0f; - unsigned char *src = (unsigned char *)&a; - unsigned char *dst = (unsigned char *)&d; + float d = 0.0f; + unsigned char *src = (unsigned char *)&a; + unsigned char *dst = (unsigned char *)&d; - dst[0] = src[3]; - dst[1] = src[2]; - dst[2] = src[1]; - dst[3] = src[0]; + dst[0] = src[3]; + dst[1] = src[2]; + dst[2] = src[1]; + dst[3] = src[0]; - return d; + return d; } - // swap using char pointers -B3_FORCE_INLINE void b3SwapEndianDouble(double d, unsigned char* dst) +B3_FORCE_INLINE void b3SwapEndianDouble(double d, unsigned char *dst) { - unsigned char *src = (unsigned char *)&d; - - dst[0] = src[7]; - dst[1] = src[6]; - dst[2] = src[5]; - dst[3] = src[4]; - dst[4] = src[3]; - dst[5] = src[2]; - dst[6] = src[1]; - dst[7] = src[0]; - + unsigned char *src = (unsigned char *)&d; + + dst[0] = src[7]; + dst[1] = src[6]; + dst[2] = src[5]; + dst[3] = src[4]; + dst[4] = src[3]; + dst[5] = src[2]; + dst[6] = src[1]; + dst[7] = src[0]; } // unswap using char pointers -B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src) +B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src) { - double d = 0.0; - unsigned char *dst = (unsigned char *)&d; - - dst[0] = src[7]; - dst[1] = src[6]; - dst[2] = src[5]; - dst[3] = src[4]; - dst[4] = src[3]; - dst[5] = src[2]; - dst[6] = src[1]; - dst[7] = src[0]; + double d = 0.0; + unsigned char *dst = (unsigned char *)&d; + + dst[0] = src[7]; + dst[1] = src[6]; + dst[2] = src[5]; + dst[3] = src[4]; + dst[4] = src[3]; + dst[5] = src[2]; + dst[6] = src[1]; + dst[7] = src[0]; return d; } // returns normalized value in range [-B3_PI, B3_PI] -B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians) +B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians) { angleInRadians = b3Fmod(angleInRadians, B3_2_PI); - if(angleInRadians < -B3_PI) + if (angleInRadians < -B3_PI) { return angleInRadians + B3_2_PI; } - else if(angleInRadians > B3_PI) + else if (angleInRadians > B3_PI) { return angleInRadians - B3_2_PI; } @@ -626,38 +652,34 @@ B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians) struct b3TypedObject { b3TypedObject(int objectType) - :m_objectType(objectType) + : m_objectType(objectType) { } - int m_objectType; + int m_objectType; inline int getObjectType() const { return m_objectType; } }; - - ///align a pointer to the provided alignment, upwards -template <typename T>T* b3AlignPointer(T* unalignedPtr, size_t alignment) +template <typename T> +T *b3AlignPointer(T *unalignedPtr, size_t alignment) { - struct b3ConvertPointerSizeT { - union - { - T* ptr; - size_t integer; + union { + T *ptr; + size_t integer; }; }; - b3ConvertPointerSizeT converter; - - + b3ConvertPointerSizeT converter; + const size_t bit_mask = ~(alignment - 1); - converter.ptr = unalignedPtr; - converter.integer += alignment-1; + converter.ptr = unalignedPtr; + converter.integer += alignment - 1; converter.integer &= bit_mask; return converter.ptr; } -#endif //B3_SCALAR_H +#endif //B3_SCALAR_H diff --git a/thirdparty/bullet/Bullet3Common/b3StackAlloc.h b/thirdparty/bullet/Bullet3Common/b3StackAlloc.h index de7de056b5..4972236ac7 100644 --- a/thirdparty/bullet/Bullet3Common/b3StackAlloc.h +++ b/thirdparty/bullet/Bullet3Common/b3StackAlloc.h @@ -20,97 +20,99 @@ Nov.2006 #ifndef B3_STACK_ALLOC #define B3_STACK_ALLOC -#include "b3Scalar.h" //for b3Assert +#include "b3Scalar.h" //for b3Assert #include "b3AlignedAllocator.h" ///The b3Block class is an internal structure for the b3StackAlloc memory allocator. struct b3Block { - b3Block* previous; - unsigned char* address; + b3Block* previous; + unsigned char* address; }; ///The StackAlloc class provides some fast stack-based memory allocator (LIFO last-in first-out) class b3StackAlloc { public: + b3StackAlloc(unsigned int size) + { + ctor(); + create(size); + } + ~b3StackAlloc() { destroy(); } - b3StackAlloc(unsigned int size) { ctor();create(size); } - ~b3StackAlloc() { destroy(); } - - inline void create(unsigned int size) + inline void create(unsigned int size) { destroy(); - data = (unsigned char*) b3AlignedAlloc(size,16); - totalsize = size; + data = (unsigned char*)b3AlignedAlloc(size, 16); + totalsize = size; } - inline void destroy() + inline void destroy() { - b3Assert(usedsize==0); + b3Assert(usedsize == 0); //Raise(L"StackAlloc is still in use"); - if(usedsize==0) + if (usedsize == 0) { - if(!ischild && data) + if (!ischild && data) b3AlignedFree(data); - data = 0; - usedsize = 0; + data = 0; + usedsize = 0; } - } - int getAvailableMemory() const + int getAvailableMemory() const { return static_cast<int>(totalsize - usedsize); } - unsigned char* allocate(unsigned int size) + unsigned char* allocate(unsigned int size) { - const unsigned int nus(usedsize+size); - if(nus<totalsize) + const unsigned int nus(usedsize + size); + if (nus < totalsize) { - usedsize=nus; - return(data+(usedsize-size)); + usedsize = nus; + return (data + (usedsize - size)); } b3Assert(0); //&& (L"Not enough memory")); - - return(0); + + return (0); } - B3_FORCE_INLINE b3Block* beginBlock() + B3_FORCE_INLINE b3Block* beginBlock() { - b3Block* pb = (b3Block*)allocate(sizeof(b3Block)); - pb->previous = current; - pb->address = data+usedsize; - current = pb; - return(pb); + b3Block* pb = (b3Block*)allocate(sizeof(b3Block)); + pb->previous = current; + pb->address = data + usedsize; + current = pb; + return (pb); } - B3_FORCE_INLINE void endBlock(b3Block* block) + B3_FORCE_INLINE void endBlock(b3Block* block) { - b3Assert(block==current); + b3Assert(block == current); //Raise(L"Unmatched blocks"); - if(block==current) + if (block == current) { - current = block->previous; - usedsize = (unsigned int)((block->address-data)-sizeof(b3Block)); + current = block->previous; + usedsize = (unsigned int)((block->address - data) - sizeof(b3Block)); } } private: - void ctor() + void ctor() { - data = 0; - totalsize = 0; - usedsize = 0; - current = 0; - ischild = false; + data = 0; + totalsize = 0; + usedsize = 0; + current = 0; + ischild = false; } - unsigned char* data; - unsigned int totalsize; - unsigned int usedsize; - b3Block* current; - bool ischild; + unsigned char* data; + unsigned int totalsize; + unsigned int usedsize; + b3Block* current; + bool ischild; }; -#endif //B3_STACK_ALLOC +#endif //B3_STACK_ALLOC diff --git a/thirdparty/bullet/Bullet3Common/b3Transform.h b/thirdparty/bullet/Bullet3Common/b3Transform.h index fa480759be..149da9d148 100644 --- a/thirdparty/bullet/Bullet3Common/b3Transform.h +++ b/thirdparty/bullet/Bullet3Common/b3Transform.h @@ -12,11 +12,9 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_TRANSFORM_H #define B3_TRANSFORM_H - #include "b3Matrix3x3.h" #ifdef B3_USE_DOUBLE_PRECISION @@ -25,46 +23,45 @@ subject to the following restrictions: #define b3TransformData b3TransformFloatData #endif - - - /**@brief The b3Transform class supports rigid transforms with only translation and rotation and no scaling/shear. *It can be used in combination with b3Vector3, b3Quaternion and b3Matrix3x3 linear algebra classes. */ -B3_ATTRIBUTE_ALIGNED16(class) b3Transform { - - ///Storage for the rotation +B3_ATTRIBUTE_ALIGNED16(class) +b3Transform +{ + ///Storage for the rotation b3Matrix3x3 m_basis; - ///Storage for the translation - b3Vector3 m_origin; + ///Storage for the translation + b3Vector3 m_origin; public: - - /**@brief No initialization constructor */ + /**@brief No initialization constructor */ b3Transform() {} - /**@brief Constructor from b3Quaternion (optional b3Vector3 ) + /**@brief Constructor from b3Quaternion (optional b3Vector3 ) * @param q Rotation from quaternion * @param c Translation from Vector (default 0,0,0) */ - explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q, - const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) + explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q, + const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) : m_basis(q), - m_origin(c) - {} + m_origin(c) + { + } - /**@brief Constructor from b3Matrix3x3 (optional b3Vector3) + /**@brief Constructor from b3Matrix3x3 (optional b3Vector3) * @param b Rotation from Matrix * @param c Translation from Vector default (0,0,0)*/ - explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b, - const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) + explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b, + const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) : m_basis(b), - m_origin(c) - {} - /**@brief Copy constructor */ - B3_FORCE_INLINE b3Transform (const b3Transform& other) + m_origin(c) + { + } + /**@brief Copy constructor */ + B3_FORCE_INLINE b3Transform(const b3Transform& other) : m_basis(other.m_basis), - m_origin(other.m_origin) + m_origin(other.m_origin) { } - /**@brief Assignment Operator */ + /**@brief Assignment Operator */ B3_FORCE_INLINE b3Transform& operator=(const b3Transform& other) { m_basis = other.m_basis; @@ -72,70 +69,70 @@ public: return *this; } - - /**@brief Set the current transform as the value of the product of two transforms + /**@brief Set the current transform as the value of the product of two transforms * @param t1 Transform 1 * @param t2 Transform 2 * This = Transform1 * Transform2 */ - B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2) { - m_basis = t1.m_basis * t2.m_basis; - m_origin = t1(t2.m_origin); - } + B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2) + { + m_basis = t1.m_basis * t2.m_basis; + m_origin = t1(t2.m_origin); + } -/* void multInverseLeft(const b3Transform& t1, const b3Transform& t2) { + /* void multInverseLeft(const b3Transform& t1, const b3Transform& t2) { b3Vector3 v = t2.m_origin - t1.m_origin; m_basis = b3MultTransposeLeft(t1.m_basis, t2.m_basis); m_origin = v * t1.m_basis; } */ -/**@brief Return the transform of the vector */ + /**@brief Return the transform of the vector */ B3_FORCE_INLINE b3Vector3 operator()(const b3Vector3& x) const { - return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin; + return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin; } - /**@brief Return the transform of the vector */ + /**@brief Return the transform of the vector */ B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& x) const { return (*this)(x); } - /**@brief Return the transform of the b3Quaternion */ + /**@brief Return the transform of the b3Quaternion */ B3_FORCE_INLINE b3Quaternion operator*(const b3Quaternion& q) const { return getRotation() * q; } - /**@brief Return the basis matrix for the rotation */ - B3_FORCE_INLINE b3Matrix3x3& getBasis() { return m_basis; } - /**@brief Return the basis matrix for the rotation */ - B3_FORCE_INLINE const b3Matrix3x3& getBasis() const { return m_basis; } + /**@brief Return the basis matrix for the rotation */ + B3_FORCE_INLINE b3Matrix3x3& getBasis() { return m_basis; } + /**@brief Return the basis matrix for the rotation */ + B3_FORCE_INLINE const b3Matrix3x3& getBasis() const { return m_basis; } - /**@brief Return the origin vector translation */ - B3_FORCE_INLINE b3Vector3& getOrigin() { return m_origin; } - /**@brief Return the origin vector translation */ - B3_FORCE_INLINE const b3Vector3& getOrigin() const { return m_origin; } + /**@brief Return the origin vector translation */ + B3_FORCE_INLINE b3Vector3& getOrigin() { return m_origin; } + /**@brief Return the origin vector translation */ + B3_FORCE_INLINE const b3Vector3& getOrigin() const { return m_origin; } - /**@brief Return a quaternion representing the rotation */ - b3Quaternion getRotation() const { + /**@brief Return a quaternion representing the rotation */ + b3Quaternion getRotation() const + { b3Quaternion q; m_basis.getRotation(q); return q; } - - - /**@brief Set from an array + + /**@brief Set from an array * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */ - void setFromOpenGLMatrix(const b3Scalar *m) + void setFromOpenGLMatrix(const b3Scalar* m) { m_basis.setFromOpenGLSubMatrix(m); - m_origin.setValue(m[12],m[13],m[14]); + m_origin.setValue(m[12], m[13], m[14]); } - /**@brief Fill an array representation + /**@brief Fill an array representation * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */ - void getOpenGLMatrix(b3Scalar *m) const + void getOpenGLMatrix(b3Scalar * m) const { m_basis.getOpenGLSubMatrix(m); m[12] = m_origin.getX(); @@ -144,80 +141,76 @@ public: m[15] = b3Scalar(1.0); } - /**@brief Set the translational element + /**@brief Set the translational element * @param origin The vector to set the translation to */ - B3_FORCE_INLINE void setOrigin(const b3Vector3& origin) - { + B3_FORCE_INLINE void setOrigin(const b3Vector3& origin) + { m_origin = origin; } B3_FORCE_INLINE b3Vector3 invXform(const b3Vector3& inVec) const; - - /**@brief Set the rotational element by b3Matrix3x3 */ + /**@brief Set the rotational element by b3Matrix3x3 */ B3_FORCE_INLINE void setBasis(const b3Matrix3x3& basis) - { + { m_basis = basis; } - /**@brief Set the rotational element by b3Quaternion */ + /**@brief Set the rotational element by b3Quaternion */ B3_FORCE_INLINE void setRotation(const b3Quaternion& q) { m_basis.setRotation(q); } - - /**@brief Set this transformation to the identity */ + /**@brief Set this transformation to the identity */ void setIdentity() { m_basis.setIdentity(); m_origin.setValue(b3Scalar(0.0), b3Scalar(0.0), b3Scalar(0.0)); } - /**@brief Multiply this Transform by another(this = this * another) + /**@brief Multiply this Transform by another(this = this * another) * @param t The other transform */ - b3Transform& operator*=(const b3Transform& t) + b3Transform& operator*=(const b3Transform& t) { m_origin += m_basis * t.m_origin; m_basis *= t.m_basis; return *this; } - /**@brief Return the inverse of this transform */ + /**@brief Return the inverse of this transform */ b3Transform inverse() const - { + { b3Matrix3x3 inv = m_basis.transpose(); return b3Transform(inv, inv * -m_origin); } - /**@brief Return the inverse of this transform times the other transform + /**@brief Return the inverse of this transform times the other transform * @param t The other transform * return this.inverse() * the other */ - b3Transform inverseTimes(const b3Transform& t) const; + b3Transform inverseTimes(const b3Transform& t) const; - /**@brief Return the product of this transform and the other */ + /**@brief Return the product of this transform and the other */ b3Transform operator*(const b3Transform& t) const; - /**@brief Return an identity transform */ - static const b3Transform& getIdentity() + /**@brief Return an identity transform */ + static const b3Transform& getIdentity() { static const b3Transform identityTransform(b3Matrix3x3::getIdentity()); return identityTransform; } - void serialize(struct b3TransformData& dataOut) const; - - void serializeFloat(struct b3TransformFloatData& dataOut) const; + void serialize(struct b3TransformData & dataOut) const; - void deSerialize(const struct b3TransformData& dataIn); + void serializeFloat(struct b3TransformFloatData & dataOut) const; - void deSerializeDouble(const struct b3TransformDoubleData& dataIn); + void deSerialize(const struct b3TransformData& dataIn); - void deSerializeFloat(const struct b3TransformFloatData& dataIn); + void deSerializeDouble(const struct b3TransformDoubleData& dataIn); + void deSerializeFloat(const struct b3TransformFloatData& dataIn); }; - B3_FORCE_INLINE b3Vector3 b3Transform::invXform(const b3Vector3& inVec) const { @@ -225,80 +218,69 @@ b3Transform::invXform(const b3Vector3& inVec) const return (m_basis.transpose() * v); } -B3_FORCE_INLINE b3Transform -b3Transform::inverseTimes(const b3Transform& t) const +B3_FORCE_INLINE b3Transform +b3Transform::inverseTimes(const b3Transform& t) const { b3Vector3 v = t.getOrigin() - m_origin; - return b3Transform(m_basis.transposeTimes(t.m_basis), - v * m_basis); + return b3Transform(m_basis.transposeTimes(t.m_basis), + v * m_basis); } -B3_FORCE_INLINE b3Transform -b3Transform::operator*(const b3Transform& t) const +B3_FORCE_INLINE b3Transform + b3Transform::operator*(const b3Transform& t) const { - return b3Transform(m_basis * t.m_basis, - (*this)(t.m_origin)); + return b3Transform(m_basis * t.m_basis, + (*this)(t.m_origin)); } /**@brief Test if two transforms have all elements equal */ B3_FORCE_INLINE bool operator==(const b3Transform& t1, const b3Transform& t2) { - return ( t1.getBasis() == t2.getBasis() && - t1.getOrigin() == t2.getOrigin() ); + return (t1.getBasis() == t2.getBasis() && + t1.getOrigin() == t2.getOrigin()); } - ///for serialization -struct b3TransformFloatData +struct b3TransformFloatData { - b3Matrix3x3FloatData m_basis; - b3Vector3FloatData m_origin; + b3Matrix3x3FloatData m_basis; + b3Vector3FloatData m_origin; }; -struct b3TransformDoubleData +struct b3TransformDoubleData { - b3Matrix3x3DoubleData m_basis; - b3Vector3DoubleData m_origin; + b3Matrix3x3DoubleData m_basis; + b3Vector3DoubleData m_origin; }; - - -B3_FORCE_INLINE void b3Transform::serialize(b3TransformData& dataOut) const +B3_FORCE_INLINE void b3Transform::serialize(b3TransformData& dataOut) const { m_basis.serialize(dataOut.m_basis); m_origin.serialize(dataOut.m_origin); } -B3_FORCE_INLINE void b3Transform::serializeFloat(b3TransformFloatData& dataOut) const +B3_FORCE_INLINE void b3Transform::serializeFloat(b3TransformFloatData& dataOut) const { m_basis.serializeFloat(dataOut.m_basis); m_origin.serializeFloat(dataOut.m_origin); } - -B3_FORCE_INLINE void b3Transform::deSerialize(const b3TransformData& dataIn) +B3_FORCE_INLINE void b3Transform::deSerialize(const b3TransformData& dataIn) { m_basis.deSerialize(dataIn.m_basis); m_origin.deSerialize(dataIn.m_origin); } -B3_FORCE_INLINE void b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn) +B3_FORCE_INLINE void b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn) { m_basis.deSerializeFloat(dataIn.m_basis); m_origin.deSerializeFloat(dataIn.m_origin); } -B3_FORCE_INLINE void b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn) +B3_FORCE_INLINE void b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn) { m_basis.deSerializeDouble(dataIn.m_basis); m_origin.deSerializeDouble(dataIn.m_origin); } - -#endif //B3_TRANSFORM_H - - - - - - +#endif //B3_TRANSFORM_H diff --git a/thirdparty/bullet/Bullet3Common/b3TransformUtil.h b/thirdparty/bullet/Bullet3Common/b3TransformUtil.h index 6ce580c132..1850a9be5f 100644 --- a/thirdparty/bullet/Bullet3Common/b3TransformUtil.h +++ b/thirdparty/bullet/Bullet3Common/b3TransformUtil.h @@ -12,204 +12,189 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_TRANSFORM_UTIL_H #define B3_TRANSFORM_UTIL_H #include "b3Transform.h" -#define B3_ANGULAR_MOTION_THRESHOLD b3Scalar(0.5)*B3_HALF_PI - - - +#define B3_ANGULAR_MOTION_THRESHOLD b3Scalar(0.5) * B3_HALF_PI -B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents,const b3Vector3& supportDir) +B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents, const b3Vector3& supportDir) { return b3MakeVector3(supportDir.getX() < b3Scalar(0.0) ? -halfExtents.getX() : halfExtents.getX(), - supportDir.getY() < b3Scalar(0.0) ? -halfExtents.getY() : halfExtents.getY(), - supportDir.getZ() < b3Scalar(0.0) ? -halfExtents.getZ() : halfExtents.getZ()); + supportDir.getY() < b3Scalar(0.0) ? -halfExtents.getY() : halfExtents.getY(), + supportDir.getZ() < b3Scalar(0.0) ? -halfExtents.getZ() : halfExtents.getZ()); } - - - - - /// Utils related to temporal transforms class b3TransformUtil { - public: - - static void integrateTransform(const b3Transform& curTrans,const b3Vector3& linvel,const b3Vector3& angvel,b3Scalar timeStep,b3Transform& predictedTransform) + static void integrateTransform(const b3Transform& curTrans, const b3Vector3& linvel, const b3Vector3& angvel, b3Scalar timeStep, b3Transform& predictedTransform) { predictedTransform.setOrigin(curTrans.getOrigin() + linvel * timeStep); -// #define QUATERNION_DERIVATIVE - #ifdef QUATERNION_DERIVATIVE + // #define QUATERNION_DERIVATIVE +#ifdef QUATERNION_DERIVATIVE b3Quaternion predictedOrn = curTrans.getRotation(); predictedOrn += (angvel * predictedOrn) * (timeStep * b3Scalar(0.5)); predictedOrn.normalize(); - #else +#else //Exponential map //google for "Practical Parameterization of Rotations Using the Exponential Map", F. Sebastian Grassia b3Vector3 axis; - b3Scalar fAngle = angvel.length(); + b3Scalar fAngle = angvel.length(); //limit the angular motion - if (fAngle*timeStep > B3_ANGULAR_MOTION_THRESHOLD) + if (fAngle * timeStep > B3_ANGULAR_MOTION_THRESHOLD) { fAngle = B3_ANGULAR_MOTION_THRESHOLD / timeStep; } - if ( fAngle < b3Scalar(0.001) ) + if (fAngle < b3Scalar(0.001)) { // use Taylor's expansions of sync function - axis = angvel*( b3Scalar(0.5)*timeStep-(timeStep*timeStep*timeStep)*(b3Scalar(0.020833333333))*fAngle*fAngle ); + axis = angvel * (b3Scalar(0.5) * timeStep - (timeStep * timeStep * timeStep) * (b3Scalar(0.020833333333)) * fAngle * fAngle); } else { // sync(fAngle) = sin(c*fAngle)/t - axis = angvel*( b3Sin(b3Scalar(0.5)*fAngle*timeStep)/fAngle ); + axis = angvel * (b3Sin(b3Scalar(0.5) * fAngle * timeStep) / fAngle); } - b3Quaternion dorn (axis.getX(),axis.getY(),axis.getZ(),b3Cos( fAngle*timeStep*b3Scalar(0.5) )); + b3Quaternion dorn(axis.getX(), axis.getY(), axis.getZ(), b3Cos(fAngle * timeStep * b3Scalar(0.5))); b3Quaternion orn0 = curTrans.getRotation(); b3Quaternion predictedOrn = dorn * orn0; predictedOrn.normalize(); - #endif +#endif predictedTransform.setRotation(predictedOrn); } - static void calculateVelocityQuaternion(const b3Vector3& pos0,const b3Vector3& pos1,const b3Quaternion& orn0,const b3Quaternion& orn1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel) + static void calculateVelocityQuaternion(const b3Vector3& pos0, const b3Vector3& pos1, const b3Quaternion& orn0, const b3Quaternion& orn1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel) { linVel = (pos1 - pos0) / timeStep; b3Vector3 axis; - b3Scalar angle; + b3Scalar angle; if (orn0 != orn1) { - calculateDiffAxisAngleQuaternion(orn0,orn1,axis,angle); + calculateDiffAxisAngleQuaternion(orn0, orn1, axis, angle); angVel = axis * angle / timeStep; - } else + } + else { - angVel.setValue(0,0,0); + angVel.setValue(0, 0, 0); } } - static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0,const b3Quaternion& orn1a,b3Vector3& axis,b3Scalar& angle) + static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0, const b3Quaternion& orn1a, b3Vector3& axis, b3Scalar& angle) { b3Quaternion orn1 = orn0.nearest(orn1a); b3Quaternion dorn = orn1 * orn0.inverse(); angle = dorn.getAngle(); - axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ()); + axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ()); axis[3] = b3Scalar(0.); //check for axis length b3Scalar len = axis.length2(); - if (len < B3_EPSILON*B3_EPSILON) - axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.)); + if (len < B3_EPSILON * B3_EPSILON) + axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.)); else axis /= b3Sqrt(len); } - static void calculateVelocity(const b3Transform& transform0,const b3Transform& transform1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel) + static void calculateVelocity(const b3Transform& transform0, const b3Transform& transform1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel) { linVel = (transform1.getOrigin() - transform0.getOrigin()) / timeStep; b3Vector3 axis; - b3Scalar angle; - calculateDiffAxisAngle(transform0,transform1,axis,angle); + b3Scalar angle; + calculateDiffAxisAngle(transform0, transform1, axis, angle); angVel = axis * angle / timeStep; } - static void calculateDiffAxisAngle(const b3Transform& transform0,const b3Transform& transform1,b3Vector3& axis,b3Scalar& angle) + static void calculateDiffAxisAngle(const b3Transform& transform0, const b3Transform& transform1, b3Vector3& axis, b3Scalar& angle) { b3Matrix3x3 dmat = transform1.getBasis() * transform0.getBasis().inverse(); b3Quaternion dorn; dmat.getRotation(dorn); - ///floating point inaccuracy can lead to w component > 1..., which breaks + ///floating point inaccuracy can lead to w component > 1..., which breaks dorn.normalize(); - + angle = dorn.getAngle(); - axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ()); + axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ()); axis[3] = b3Scalar(0.); //check for axis length b3Scalar len = axis.length2(); - if (len < B3_EPSILON*B3_EPSILON) - axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.)); + if (len < B3_EPSILON * B3_EPSILON) + axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.)); else axis /= b3Sqrt(len); } - }; - -///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection +///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection ///by conservatively updating a cached separating distance/vector instead of re-calculating the closest distance -class b3ConvexSeparatingDistanceUtil +class b3ConvexSeparatingDistanceUtil { - b3Quaternion m_ornA; - b3Quaternion m_ornB; - b3Vector3 m_posA; - b3Vector3 m_posB; - - b3Vector3 m_separatingNormal; + b3Quaternion m_ornA; + b3Quaternion m_ornB; + b3Vector3 m_posA; + b3Vector3 m_posB; - b3Scalar m_boundingRadiusA; - b3Scalar m_boundingRadiusB; - b3Scalar m_separatingDistance; + b3Vector3 m_separatingNormal; -public: + b3Scalar m_boundingRadiusA; + b3Scalar m_boundingRadiusB; + b3Scalar m_separatingDistance; - b3ConvexSeparatingDistanceUtil(b3Scalar boundingRadiusA,b3Scalar boundingRadiusB) - :m_boundingRadiusA(boundingRadiusA), - m_boundingRadiusB(boundingRadiusB), - m_separatingDistance(0.f) +public: + b3ConvexSeparatingDistanceUtil(b3Scalar boundingRadiusA, b3Scalar boundingRadiusB) + : m_boundingRadiusA(boundingRadiusA), + m_boundingRadiusB(boundingRadiusB), + m_separatingDistance(0.f) { } - b3Scalar getConservativeSeparatingDistance() + b3Scalar getConservativeSeparatingDistance() { return m_separatingDistance; } - void updateSeparatingDistance(const b3Transform& transA,const b3Transform& transB) + void updateSeparatingDistance(const b3Transform& transA, const b3Transform& transB) { const b3Vector3& toPosA = transA.getOrigin(); const b3Vector3& toPosB = transB.getOrigin(); b3Quaternion toOrnA = transA.getRotation(); b3Quaternion toOrnB = transB.getRotation(); - if (m_separatingDistance>0.f) + if (m_separatingDistance > 0.f) { - - - b3Vector3 linVelA,angVelA,linVelB,angVelB; - b3TransformUtil::calculateVelocityQuaternion(m_posA,toPosA,m_ornA,toOrnA,b3Scalar(1.),linVelA,angVelA); - b3TransformUtil::calculateVelocityQuaternion(m_posB,toPosB,m_ornB,toOrnB,b3Scalar(1.),linVelB,angVelB); + b3Vector3 linVelA, angVelA, linVelB, angVelB; + b3TransformUtil::calculateVelocityQuaternion(m_posA, toPosA, m_ornA, toOrnA, b3Scalar(1.), linVelA, angVelA); + b3TransformUtil::calculateVelocityQuaternion(m_posB, toPosB, m_ornB, toOrnB, b3Scalar(1.), linVelB, angVelB); b3Scalar maxAngularProjectedVelocity = angVelA.length() * m_boundingRadiusA + angVelB.length() * m_boundingRadiusB; - b3Vector3 relLinVel = (linVelB-linVelA); + b3Vector3 relLinVel = (linVelB - linVelA); b3Scalar relLinVelocLength = relLinVel.dot(m_separatingNormal); - if (relLinVelocLength<0.f) + if (relLinVelocLength < 0.f) { relLinVelocLength = 0.f; } - - b3Scalar projectedMotion = maxAngularProjectedVelocity +relLinVelocLength; + + b3Scalar projectedMotion = maxAngularProjectedVelocity + relLinVelocLength; m_separatingDistance -= projectedMotion; } - + m_posA = toPosA; m_posB = toPosB; m_ornA = toOrnA; m_ornB = toOrnB; } - void initSeparatingDistance(const b3Vector3& separatingVector,b3Scalar separatingDistance,const b3Transform& transA,const b3Transform& transB) + void initSeparatingDistance(const b3Vector3& separatingVector, b3Scalar separatingDistance, const b3Transform& transA, const b3Transform& transB) { m_separatingDistance = separatingDistance; - if (m_separatingDistance>0.f) + if (m_separatingDistance > 0.f) { m_separatingNormal = separatingVector; - + const b3Vector3& toPosA = transA.getOrigin(); const b3Vector3& toPosB = transB.getOrigin(); b3Quaternion toOrnA = transA.getRotation(); @@ -220,9 +205,6 @@ public: m_ornB = toOrnB; } } - }; - -#endif //B3_TRANSFORM_UTIL_H - +#endif //B3_TRANSFORM_UTIL_H diff --git a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp index 5f5ac4ac04..100fb774c1 100644 --- a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp +++ b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp @@ -14,274 +14,281 @@ This source version has been altered. */ -#if defined (_WIN32) || defined (__i386__) +#if defined(_WIN32) || defined(__i386__) #define B3_USE_SSE_IN_API #endif #include "b3Vector3.h" -#if defined (B3_USE_SSE) || defined (B3_USE_NEON) +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) #ifdef __APPLE__ #include <stdint.h> -typedef float float4 __attribute__ ((vector_size(16))); +typedef float float4 __attribute__((vector_size(16))); #else #define float4 __m128 #endif //typedef uint32_t uint4 __attribute__ ((vector_size(16))); - #if defined B3_USE_SSE || defined _WIN32 -#define LOG2_ARRAY_SIZE 6 -#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE) +#define LOG2_ARRAY_SIZE 6 +#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE) #include <emmintrin.h> -long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ); -long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult); +long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult) { - const float4 *vertices = (const float4*) vv; - static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; - float4 dotMax = b3Assign128( -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY ); - float4 vvec = _mm_loadu_ps( vec ); - float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa )); /// zzzz - float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy + const float4 *vertices = (const float4 *)vv; + static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + float4 dotMax = b3Assign128(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY); + float4 vvec = _mm_loadu_ps(vec); + float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz + float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy - long maxIndex = -1L; + long maxIndex = -1L; - size_t segment = 0; - float4 stack_array[ STACK_ARRAY_COUNT ]; + size_t segment = 0; + float4 stack_array[STACK_ARRAY_COUNT]; #if DEBUG - // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); + // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); #endif - size_t index; - float4 max; - // Faster loop without cleanup code for full tiles - for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) - { - max = dotMax; - - for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 ) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. - } - - // If we found a new max - if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax))) - { - // copy the new max across all lanes of our max accumulator - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e)); - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1)); - - dotMax = max; - - // find first occurrence of that max - size_t test; - for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4 - {} - // record where it is. - maxIndex = 4*index + segment + indexTable[test]; - } - } - - // account for work we've already done - count -= segment; - - // Deal with the last < STACK_ARRAY_COUNT vectors - max = dotMax; - index = 0; - - - if( b3Unlikely( count > 16) ) - { - for( ; index + 4 <= count / 4; index+=4 ) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. - } - } - - size_t localCount = (count & -4L) - 4*index; - if( localCount ) - { + size_t index; + float4 max; + // Faster loop without cleanup code for full tiles + for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4) + { + max = dotMax; + + for (index = 0; index < STACK_ARRAY_COUNT; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. + } + + // If we found a new max + if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax))) + { + // copy the new max across all lanes of our max accumulator + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e)); + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1)); + + dotMax = max; + + // find first occurrence of that max + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4 + { + } + // record where it is. + maxIndex = 4 * index + segment + indexTable[test]; + } + } + + // account for work we've already done + count -= segment; + + // Deal with the last < STACK_ARRAY_COUNT vectors + max = dotMax; + index = 0; + + if (b3Unlikely(count > 16)) + { + for (; index + 4 <= count / 4; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. + } + } + + size_t localCount = (count & -4L) - 4 * index; + if (localCount) + { #ifdef __APPLE__ - float4 t0, t1, t2, t3, t4; - float4 * sap = &stack_array[index + localCount / 4]; - vertices += localCount; // counter the offset - size_t byteIndex = -(localCount) * sizeof(float); - //AT&T Code style assembly - asm volatile - ( ".align 4 \n\ + float4 t0, t1, t2, t3, t4; + float4 *sap = &stack_array[index + localCount / 4]; + vertices += localCount; // counter the offset + size_t byteIndex = -(localCount) * sizeof(float); + //AT&T Code style assembly + asm volatile( + ".align 4 \n\ 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\ movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\ movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\ @@ -307,369 +314,375 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl add $16, %[byteIndex] // advance loop counter\n\ jnz 0b \n\ " - : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex) - : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap) - : "memory", "cc" - ); - index += localCount/4; + : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex) + : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap) + : "memory", "cc"); + index += localCount / 4; #else - { - for( unsigned int i=0; i<localCount/4; i++,index++) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; - vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - } - } -#endif //__APPLE__ - } - - // process the last few points - if( count & 3 ) - { - float4 v0, v1, v2, x, y, z; - switch( count & 3 ) - { - case 3: - { - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - - // Calculate 3 dot products, transpose, duplicate v2 - float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo - float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo - lo0 = lo0*vLo; - z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2 - z = z*vHi; - float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy - lo1 = lo1*vLo; - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - } - break; - case 2: - { - v0 = vertices[0]; - v1 = vertices[1]; - float4 xy = _mm_movelh_ps(v0, v1); - z = _mm_movehl_ps(v1, v0); - xy = xy*vLo; - z = _mm_shuffle_ps( z, z, 0xa8); - x = _mm_shuffle_ps( xy, xy, 0xa8); - y = _mm_shuffle_ps( xy, xy, 0xfd); - z = z*vHi; - } - break; - case 1: - { - float4 xy = vertices[0]; - z = _mm_shuffle_ps( xy, xy, 0xaa); - xy = xy*vLo; - z = z*vHi; - x = _mm_shuffle_ps(xy, xy, 0); - y = _mm_shuffle_ps(xy, xy, 0x55); - } - break; - } - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - index++; - } - - // if we found a new max. - if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax))) - { // we found a new max. Search for it - // find max across the max vector, place in all elements of max -- big latency hit here - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e)); - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1)); - - // It is slightly faster to do this part in scalar code when count < 8. However, the common case for - // this where it actually makes a difference is handled in the early out at the top of the function, - // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced - // complexity, and removed it. - - dotMax = max; - - // scan for the first occurence of max in the array - size_t test; - for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4 - {} - maxIndex = 4*index + segment + indexTable[test]; - } - - _mm_store_ss( dotResult, dotMax); - return maxIndex; + { + for (unsigned int i = 0; i < localCount / 4; i++, index++) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + } + } +#endif //__APPLE__ + } + + // process the last few points + if (count & 3) + { + float4 v0, v1, v2, x, y, z; + switch (count & 3) + { + case 3: + { + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + + // Calculate 3 dot products, transpose, duplicate v2 + float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo + float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo + lo0 = lo0 * vLo; + z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2 + z = z * vHi; + float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy + lo1 = lo1 * vLo; + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + } + break; + case 2: + { + v0 = vertices[0]; + v1 = vertices[1]; + float4 xy = _mm_movelh_ps(v0, v1); + z = _mm_movehl_ps(v1, v0); + xy = xy * vLo; + z = _mm_shuffle_ps(z, z, 0xa8); + x = _mm_shuffle_ps(xy, xy, 0xa8); + y = _mm_shuffle_ps(xy, xy, 0xfd); + z = z * vHi; + } + break; + case 1: + { + float4 xy = vertices[0]; + z = _mm_shuffle_ps(xy, xy, 0xaa); + xy = xy * vLo; + z = z * vHi; + x = _mm_shuffle_ps(xy, xy, 0); + y = _mm_shuffle_ps(xy, xy, 0x55); + } + break; + } + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + index++; + } + + // if we found a new max. + if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax))) + { // we found a new max. Search for it + // find max across the max vector, place in all elements of max -- big latency hit here + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e)); + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1)); + + // It is slightly faster to do this part in scalar code when count < 8. However, the common case for + // this where it actually makes a difference is handled in the early out at the top of the function, + // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced + // complexity, and removed it. + + dotMax = max; + + // scan for the first occurence of max in the array + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4 + { + } + maxIndex = 4 * index + segment + indexTable[test]; + } + + _mm_store_ss(dotResult, dotMax); + return maxIndex; } -long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ); +long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult); -long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult) { - const float4 *vertices = (const float4*) vv; - static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; + const float4 *vertices = (const float4 *)vv; + static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; - float4 dotmin = b3Assign128( B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY ); - float4 vvec = _mm_loadu_ps( vec ); - float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa )); /// zzzz - float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy + float4 dotmin = b3Assign128(B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY); + float4 vvec = _mm_loadu_ps(vec); + float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz + float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy - long minIndex = -1L; + long minIndex = -1L; - size_t segment = 0; - float4 stack_array[ STACK_ARRAY_COUNT ]; + size_t segment = 0; + float4 stack_array[STACK_ARRAY_COUNT]; #if DEBUG - // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); + // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); #endif - size_t index; - float4 min; - // Faster loop without cleanup code for full tiles - for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) - { - min = dotmin; - - for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 ) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. - } - - // If we found a new min - if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin))) - { - // copy the new min across all lanes of our min accumulator - min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e)); - min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1)); - - dotmin = min; - - // find first occurrence of that min - size_t test; - for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4 - {} - // record where it is. - minIndex = 4*index + segment + indexTable[test]; - } - } - - // account for work we've already done - count -= segment; - - // Deal with the last < STACK_ARRAY_COUNT vectors - min = dotmin; - index = 0; - - - if(b3Unlikely( count > 16) ) - { - for( ; index + 4 <= count / 4; index+=4 ) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. - } - } - - size_t localCount = (count & -4L) - 4*index; - if( localCount ) - { - - + size_t index; + float4 min; + // Faster loop without cleanup code for full tiles + for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4) + { + min = dotmin; + + for (index = 0; index < STACK_ARRAY_COUNT; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. + } + + // If we found a new min + if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin))) + { + // copy the new min across all lanes of our min accumulator + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e)); + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1)); + + dotmin = min; + + // find first occurrence of that min + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4 + { + } + // record where it is. + minIndex = 4 * index + segment + indexTable[test]; + } + } + + // account for work we've already done + count -= segment; + + // Deal with the last < STACK_ARRAY_COUNT vectors + min = dotmin; + index = 0; + + if (b3Unlikely(count > 16)) + { + for (; index + 4 <= count / 4; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. + } + } + + size_t localCount = (count & -4L) - 4 * index; + if (localCount) + { #ifdef __APPLE__ - vertices += localCount; // counter the offset - float4 t0, t1, t2, t3, t4; - size_t byteIndex = -(localCount) * sizeof(float); - float4 * sap = &stack_array[index + localCount / 4]; + vertices += localCount; // counter the offset + float4 t0, t1, t2, t3, t4; + size_t byteIndex = -(localCount) * sizeof(float); + float4 *sap = &stack_array[index + localCount / 4]; - asm volatile - ( ".align 4 \n\ + asm volatile( + ".align 4 \n\ 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\ movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\ movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\ @@ -695,937 +708,930 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl add $16, %[byteIndex] // advance loop counter\n\ jnz 0b \n\ " - : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex) - : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap) - : "memory", "cc" - ); - index += localCount/4; + : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex) + : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap) + : "memory", "cc"); + index += localCount / 4; #else - { - for( unsigned int i=0; i<localCount/4; i++,index++) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; - vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - min = _mm_min_ps( x, min ); // control the order here so that max is never NaN even if x is nan - } - } + { + for (unsigned int i = 0; i < localCount / 4; i++, index++) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that max is never NaN even if x is nan + } + } #endif - } - - // process the last few points - if( count & 3 ) - { - float4 v0, v1, v2, x, y, z; - switch( count & 3 ) - { - case 3: - { - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - - // Calculate 3 dot products, transpose, duplicate v2 - float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo - float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo - lo0 = lo0*vLo; - z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2 - z = z*vHi; - float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy - lo1 = lo1*vLo; - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - } - break; - case 2: - { - v0 = vertices[0]; - v1 = vertices[1]; - float4 xy = _mm_movelh_ps(v0, v1); - z = _mm_movehl_ps(v1, v0); - xy = xy*vLo; - z = _mm_shuffle_ps( z, z, 0xa8); - x = _mm_shuffle_ps( xy, xy, 0xa8); - y = _mm_shuffle_ps( xy, xy, 0xfd); - z = z*vHi; - } - break; - case 1: - { - float4 xy = vertices[0]; - z = _mm_shuffle_ps( xy, xy, 0xaa); - xy = xy*vLo; - z = z*vHi; - x = _mm_shuffle_ps(xy, xy, 0); - y = _mm_shuffle_ps(xy, xy, 0x55); - } - break; - } - x = x+y; - x = x+z; - stack_array[index] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - index++; - } - - // if we found a new min. - if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin))) - { // we found a new min. Search for it - // find min across the min vector, place in all elements of min -- big latency hit here - min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e)); - min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1)); - - // It is slightly faster to do this part in scalar code when count < 8. However, the common case for - // this where it actually makes a difference is handled in the early out at the top of the function, - // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced - // complexity, and removed it. - - dotmin = min; - - // scan for the first occurence of min in the array - size_t test; - for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4 - {} - minIndex = 4*index + segment + indexTable[test]; - } - - _mm_store_ss( dotResult, dotmin); - return minIndex; + } + + // process the last few points + if (count & 3) + { + float4 v0, v1, v2, x, y, z; + switch (count & 3) + { + case 3: + { + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + + // Calculate 3 dot products, transpose, duplicate v2 + float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo + float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo + lo0 = lo0 * vLo; + z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2 + z = z * vHi; + float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy + lo1 = lo1 * vLo; + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + } + break; + case 2: + { + v0 = vertices[0]; + v1 = vertices[1]; + float4 xy = _mm_movelh_ps(v0, v1); + z = _mm_movehl_ps(v1, v0); + xy = xy * vLo; + z = _mm_shuffle_ps(z, z, 0xa8); + x = _mm_shuffle_ps(xy, xy, 0xa8); + y = _mm_shuffle_ps(xy, xy, 0xfd); + z = z * vHi; + } + break; + case 1: + { + float4 xy = vertices[0]; + z = _mm_shuffle_ps(xy, xy, 0xaa); + xy = xy * vLo; + z = z * vHi; + x = _mm_shuffle_ps(xy, xy, 0); + y = _mm_shuffle_ps(xy, xy, 0x55); + } + break; + } + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + index++; + } + + // if we found a new min. + if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin))) + { // we found a new min. Search for it + // find min across the min vector, place in all elements of min -- big latency hit here + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e)); + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1)); + + // It is slightly faster to do this part in scalar code when count < 8. However, the common case for + // this where it actually makes a difference is handled in the early out at the top of the function, + // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced + // complexity, and removed it. + + dotmin = min; + + // scan for the first occurence of min in the array + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4 + { + } + minIndex = 4 * index + segment + indexTable[test]; + } + + _mm_store_ss(dotResult, dotmin); + return minIndex; } - #elif defined B3_USE_NEON -#define ARM_NEON_GCC_COMPATIBILITY 1 +#define ARM_NEON_GCC_COMPATIBILITY 1 #include <arm_neon.h> +static long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult); -static long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ); +long (*b3_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_maxdot_large_sel; +long (*b3_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_mindot_large_sel; -long (*b3_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_maxdot_large_sel; -long (*b3_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_mindot_large_sel; - -extern "C" {int _get_cpu_capabilities( void );} - -static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ) +extern "C" { - if( _get_cpu_capabilities() & 0x2000 ) - b3_maxdot_large = _maxdot_large_v1; - else - b3_maxdot_large = _maxdot_large_v0; - - return b3_maxdot_large(vv, vec, count, dotResult); + int _get_cpu_capabilities(void); } -static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ) +static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult) { - if( _get_cpu_capabilities() & 0x2000 ) - b3_mindot_large = _mindot_large_v1; - else - b3_mindot_large = _mindot_large_v0; + if (_get_cpu_capabilities() & 0x2000) + b3_maxdot_large = _maxdot_large_v1; + else + b3_maxdot_large = _maxdot_large_v0; - return b3_mindot_large(vv, vec, count, dotResult); + return b3_maxdot_large(vv, vec, count, dotResult); } +static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult) +{ + if (_get_cpu_capabilities() & 0x2000) + b3_mindot_large = _mindot_large_v1; + else + b3_mindot_large = _mindot_large_v0; + return b3_mindot_large(vv, vec, count, dotResult); +} -#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; }) - +#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; }) -long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult) { - unsigned long i = 0; - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x2_t vLo = vget_low_f32(vvec); - float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); - float32x2_t dotMaxLo = (float32x2_t) { -B3_INFINITY, -B3_INFINITY }; - float32x2_t dotMaxHi = (float32x2_t) { -B3_INFINITY, -B3_INFINITY }; - uint32x2_t indexLo = (uint32x2_t) {0, 1}; - uint32x2_t indexHi = (uint32x2_t) {2, 3}; - uint32x2_t iLo = (uint32x2_t) {-1, -1}; - uint32x2_t iHi = (uint32x2_t) {-1, -1}; - const uint32x2_t four = (uint32x2_t) {4,4}; - - for( ; i+8 <= count; i+= 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - xy0 = vmul_f32( vget_low_f32(v0), vLo); - xy1 = vmul_f32( vget_low_f32(v1), vLo); - xy2 = vmul_f32( vget_low_f32(v2), vLo); - xy3 = vmul_f32( vget_low_f32(v3), vLo); - - z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - zLo = vmul_f32( z0.val[0], vHi); - zHi = vmul_f32( z1.val[0], vHi); - - rLo = vpadd_f32( xy0, xy1); - rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - maskLo = vcgt_f32( rLo, dotMaxLo ); - maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - for( ; i+4 <= count; i+= 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - switch( count & 3 ) - { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy2); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - } - break; - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - rLo = vadd_f32(rLo, zLo); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); - float32x2_t zLo = vmul_f32( z0, vHi); - float32x2_t rLo = vpadd_f32( xy0, xy0); - rLo = vadd_f32(rLo, zLo); - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - - default: - break; - } - - // select best answer between hi and lo results - uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo ); - dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); - iLo = vbsl_u32(mask, iHi, iLo); - - // select best answer between even and odd results - dotMaxHi = vdup_lane_f32(dotMaxLo, 1); - iHi = vdup_lane_u32(iLo, 1); - mask = vcgt_f32( dotMaxHi, dotMaxLo ); - dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); - iLo = vbsl_u32(mask, iHi, iLo); - - *dotResult = vget_lane_f32( dotMaxLo, 0); - return vget_lane_u32(iLo, 0); + unsigned long i = 0; + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x2_t vLo = vget_low_f32(vvec); + float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); + float32x2_t dotMaxLo = (float32x2_t){-B3_INFINITY, -B3_INFINITY}; + float32x2_t dotMaxHi = (float32x2_t){-B3_INFINITY, -B3_INFINITY}; + uint32x2_t indexLo = (uint32x2_t){0, 1}; + uint32x2_t indexHi = (uint32x2_t){2, 3}; + uint32x2_t iLo = (uint32x2_t){-1, -1}; + uint32x2_t iHi = (uint32x2_t){-1, -1}; + const uint32x2_t four = (uint32x2_t){4, 4}; + + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + xy0 = vmul_f32(vget_low_f32(v0), vLo); + xy1 = vmul_f32(vget_low_f32(v1), vLo); + xy2 = vmul_f32(vget_low_f32(v2), vLo); + xy3 = vmul_f32(vget_low_f32(v3), vLo); + + z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + zLo = vmul_f32(z0.val[0], vHi); + zHi = vmul_f32(z1.val[0], vHi); + + rLo = vpadd_f32(xy0, xy1); + rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + maskLo = vcgt_f32(rLo, dotMaxLo); + maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy2); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + } + break; + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + rLo = vadd_f32(rLo, zLo); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); + float32x2_t zLo = vmul_f32(z0, vHi); + float32x2_t rLo = vpadd_f32(xy0, xy0); + rLo = vadd_f32(rLo, zLo); + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo); + dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); + iLo = vbsl_u32(mask, iHi, iLo); + + // select best answer between even and odd results + dotMaxHi = vdup_lane_f32(dotMaxLo, 1); + iHi = vdup_lane_u32(iLo, 1); + mask = vcgt_f32(dotMaxHi, dotMaxLo); + dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); + iLo = vbsl_u32(mask, iHi, iLo); + + *dotResult = vget_lane_f32(dotMaxLo, 0); + return vget_lane_u32(iLo, 0); } - -long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult) { - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); - float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); - const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 }; - uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; - uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 }; - float32x4_t maxDot = (float32x4_t) { -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY }; - - unsigned long i = 0; - for( ; i + 8 <= count; i += 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - zb = vuzpq_f32( z0, z1); - z = vmulq_f32( zb.val[0], vHi); - xy = vuzpq_f32( xy0, xy1); - x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - for( ; i + 4 <= count; i += 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - switch (count & 3) { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - - xy0 = vmulq_f32(xy0, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z0); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); - - xy0 = vmulq_f32(xy0, vLo); - - z = vmulq_f32( z, vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - default: - break; - } - - - // select best answer between hi and lo results - uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot)); - float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot)); - uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); - - // select best answer between even and odd results - float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1); - uint32x2_t indexHi = vdup_lane_u32(index2, 1); - mask = vcgt_f32( maxDotO, maxDot2 ); - maxDot2 = vbsl_f32(mask, maxDotO, maxDot2); - index2 = vbsl_u32(mask, indexHi, index2); - - *dotResult = vget_lane_f32( maxDot2, 0); - return vget_lane_u32(index2, 0); - + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); + float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); + const uint32x4_t four = (uint32x4_t){4, 4, 4, 4}; + uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3}; + uint32x4_t index = (uint32x4_t){-1, -1, -1, -1}; + float32x4_t maxDot = (float32x4_t){-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY}; + + unsigned long i = 0; + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + zb = vuzpq_f32(z0, z1); + z = vmulq_f32(zb.val[0], vHi); + xy = vuzpq_f32(xy0, xy1); + x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + + xy0 = vmulq_f32(xy0, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z0); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); + + xy0 = vmulq_f32(xy0, vLo); + + z = vmulq_f32(z, vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot)); + float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot)); + uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); + + // select best answer between even and odd results + float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1); + uint32x2_t indexHi = vdup_lane_u32(index2, 1); + mask = vcgt_f32(maxDotO, maxDot2); + maxDot2 = vbsl_f32(mask, maxDotO, maxDot2); + index2 = vbsl_u32(mask, indexHi, index2); + + *dotResult = vget_lane_f32(maxDot2, 0); + return vget_lane_u32(index2, 0); } -long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult) { - unsigned long i = 0; - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x2_t vLo = vget_low_f32(vvec); - float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); - float32x2_t dotMinLo = (float32x2_t) { B3_INFINITY, B3_INFINITY }; - float32x2_t dotMinHi = (float32x2_t) { B3_INFINITY, B3_INFINITY }; - uint32x2_t indexLo = (uint32x2_t) {0, 1}; - uint32x2_t indexHi = (uint32x2_t) {2, 3}; - uint32x2_t iLo = (uint32x2_t) {-1, -1}; - uint32x2_t iHi = (uint32x2_t) {-1, -1}; - const uint32x2_t four = (uint32x2_t) {4,4}; - - for( ; i+8 <= count; i+= 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - xy0 = vmul_f32( vget_low_f32(v0), vLo); - xy1 = vmul_f32( vget_low_f32(v1), vLo); - xy2 = vmul_f32( vget_low_f32(v2), vLo); - xy3 = vmul_f32( vget_low_f32(v3), vLo); - - z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - zLo = vmul_f32( z0.val[0], vHi); - zHi = vmul_f32( z1.val[0], vHi); - - rLo = vpadd_f32( xy0, xy1); - rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - maskLo = vclt_f32( rLo, dotMinLo ); - maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - for( ; i+4 <= count; i+= 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - switch( count & 3 ) - { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy2); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - } - break; - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - rLo = vadd_f32(rLo, zLo); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); - float32x2_t zLo = vmul_f32( z0, vHi); - float32x2_t rLo = vpadd_f32( xy0, xy0); - rLo = vadd_f32(rLo, zLo); - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - - default: - break; - } - - // select best answer between hi and lo results - uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo ); - dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); - iLo = vbsl_u32(mask, iHi, iLo); - - // select best answer between even and odd results - dotMinHi = vdup_lane_f32(dotMinLo, 1); - iHi = vdup_lane_u32(iLo, 1); - mask = vclt_f32( dotMinHi, dotMinLo ); - dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); - iLo = vbsl_u32(mask, iHi, iLo); - - *dotResult = vget_lane_f32( dotMinLo, 0); - return vget_lane_u32(iLo, 0); + unsigned long i = 0; + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x2_t vLo = vget_low_f32(vvec); + float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); + float32x2_t dotMinLo = (float32x2_t){B3_INFINITY, B3_INFINITY}; + float32x2_t dotMinHi = (float32x2_t){B3_INFINITY, B3_INFINITY}; + uint32x2_t indexLo = (uint32x2_t){0, 1}; + uint32x2_t indexHi = (uint32x2_t){2, 3}; + uint32x2_t iLo = (uint32x2_t){-1, -1}; + uint32x2_t iHi = (uint32x2_t){-1, -1}; + const uint32x2_t four = (uint32x2_t){4, 4}; + + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + xy0 = vmul_f32(vget_low_f32(v0), vLo); + xy1 = vmul_f32(vget_low_f32(v1), vLo); + xy2 = vmul_f32(vget_low_f32(v2), vLo); + xy3 = vmul_f32(vget_low_f32(v3), vLo); + + z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + zLo = vmul_f32(z0.val[0], vHi); + zHi = vmul_f32(z1.val[0], vHi); + + rLo = vpadd_f32(xy0, xy1); + rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + maskLo = vclt_f32(rLo, dotMinLo); + maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy2); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + } + break; + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + rLo = vadd_f32(rLo, zLo); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); + float32x2_t zLo = vmul_f32(z0, vHi); + float32x2_t rLo = vpadd_f32(xy0, xy0); + rLo = vadd_f32(rLo, zLo); + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo); + dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); + iLo = vbsl_u32(mask, iHi, iLo); + + // select best answer between even and odd results + dotMinHi = vdup_lane_f32(dotMinLo, 1); + iHi = vdup_lane_u32(iLo, 1); + mask = vclt_f32(dotMinHi, dotMinLo); + dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); + iLo = vbsl_u32(mask, iHi, iLo); + + *dotResult = vget_lane_f32(dotMinLo, 0); + return vget_lane_u32(iLo, 0); } -long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult) { - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); - float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); - const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 }; - uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; - uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 }; - float32x4_t minDot = (float32x4_t) { B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY }; - - unsigned long i = 0; - for( ; i + 8 <= count; i += 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - zb = vuzpq_f32( z0, z1); - z = vmulq_f32( zb.val[0], vHi); - xy = vuzpq_f32( xy0, xy1); - x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - for( ; i + 4 <= count; i += 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - switch (count & 3) { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - - xy0 = vmulq_f32(xy0, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z0); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); - - xy0 = vmulq_f32(xy0, vLo); - - z = vmulq_f32( z, vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - default: - break; - } - - - // select best answer between hi and lo results - uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot)); - float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot)); - uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); - - // select best answer between even and odd results - float32x2_t minDotO = vdup_lane_f32(minDot2, 1); - uint32x2_t indexHi = vdup_lane_u32(index2, 1); - mask = vclt_f32( minDotO, minDot2 ); - minDot2 = vbsl_f32(mask, minDotO, minDot2); - index2 = vbsl_u32(mask, indexHi, index2); - - *dotResult = vget_lane_f32( minDot2, 0); - return vget_lane_u32(index2, 0); - + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); + float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); + const uint32x4_t four = (uint32x4_t){4, 4, 4, 4}; + uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3}; + uint32x4_t index = (uint32x4_t){-1, -1, -1, -1}; + float32x4_t minDot = (float32x4_t){B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY}; + + unsigned long i = 0; + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + zb = vuzpq_f32(z0, z1); + z = vmulq_f32(zb.val[0], vHi); + xy = vuzpq_f32(xy0, xy1); + x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + + xy0 = vmulq_f32(xy0, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z0); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); + + xy0 = vmulq_f32(xy0, vLo); + + z = vmulq_f32(z, vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot)); + float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot)); + uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); + + // select best answer between even and odd results + float32x2_t minDotO = vdup_lane_f32(minDot2, 1); + uint32x2_t indexHi = vdup_lane_u32(index2, 1); + mask = vclt_f32(minDotO, minDot2); + minDot2 = vbsl_f32(mask, minDotO, minDot2); + index2 = vbsl_u32(mask, indexHi, index2); + + *dotResult = vget_lane_f32(minDot2, 0); + return vget_lane_u32(index2, 0); } #else - #error Unhandled __APPLE__ arch +#error Unhandled __APPLE__ arch #endif -#endif /* __APPLE__ */ - - +#endif /* __APPLE__ */ diff --git a/thirdparty/bullet/Bullet3Common/b3Vector3.h b/thirdparty/bullet/Bullet3Common/b3Vector3.h index 16ec02b0ed..56e6c13311 100644 --- a/thirdparty/bullet/Bullet3Common/b3Vector3.h +++ b/thirdparty/bullet/Bullet3Common/b3Vector3.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_VECTOR3_H #define B3_VECTOR3_H @@ -28,37 +26,34 @@ subject to the following restrictions: #else #define b3Vector3Data b3Vector3FloatData #define b3Vector3DataName "b3Vector3FloatData" -#endif //B3_USE_DOUBLE_PRECISION +#endif //B3_USE_DOUBLE_PRECISION #if defined B3_USE_SSE //typedef uint32_t __m128i __attribute__ ((vector_size(16))); #ifdef _MSC_VER -#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255' +#pragma warning(disable : 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255' #endif - -#define B3_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x)) +#define B3_SHUFFLE(x, y, z, w) ((w) << 6 | (z) << 4 | (y) << 2 | (x)) //#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) ) -#define b3_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) ) -#define b3_splat3_ps( _a, _i ) b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i, 3) ) -#define b3_splat_ps( _a, _i ) b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i,_i) ) +#define b3_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask)) +#define b3_splat3_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, 3)) +#define b3_splat_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, _i)) #define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) -#define b3vAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) +#define b3vAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) #define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)) #define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask) #define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask) #define b3vxyzMaskf b3vFFF0fMask #define b3vAbsfMask b3CastiTo128f(b3vAbsMask) - - const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f}; const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f}; const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f}; -const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f}; +const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f}; #endif @@ -74,70 +69,69 @@ const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x class b3Vector3; class b3Vector4; -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) //#if defined (B3_USE_SSE) || defined (B3_USE_NEON) -inline b3Vector3 b3MakeVector3( b3SimdFloat4 v); -inline b3Vector4 b3MakeVector4( b3SimdFloat4 vec); +inline b3Vector3 b3MakeVector3(b3SimdFloat4 v); +inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec); #endif -inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z); -inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w); -inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w); - +inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z); +inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w); +inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w); /**@brief b3Vector3 can be used to represent 3D points and vectors. * It has an un-used w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers */ -B3_ATTRIBUTE_ALIGNED16(class) b3Vector3 +B3_ATTRIBUTE_ALIGNED16(class) +b3Vector3 { public: -#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM - union { - b3SimdFloat4 mVec128; - float m_floats[4]; - struct {float x,y,z,w;}; - - }; +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM + union { + b3SimdFloat4 mVec128; + float m_floats[4]; + struct + { + float x, y, z, w; + }; + }; #else - union - { - float m_floats[4]; - struct {float x,y,z,w;}; + union { + float m_floats[4]; + struct + { + float x, y, z, w; + }; }; #endif - public: - B3_DECLARE_ALIGNED_ALLOCATOR(); -#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM /*B3_FORCE_INLINE b3Vector3() { } */ - B3_FORCE_INLINE b3SimdFloat4 get128() const - { - return mVec128; - } - B3_FORCE_INLINE void set128(b3SimdFloat4 v128) - { - mVec128 = v128; - } + B3_FORCE_INLINE b3SimdFloat4 get128() const + { + return mVec128; + } + B3_FORCE_INLINE void set128(b3SimdFloat4 v128) + { + mVec128 = v128; + } #endif - public: - - - -/**@brief Add a vector to this one +public: + /**@brief Add a vector to this one * @param The vector to add to this one */ B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_add_ps(mVec128, v.mVec128); #elif defined(B3_USE_NEON) mVec128 = vaddq_f32(mVec128, v.mVec128); @@ -149,12 +143,11 @@ public: return *this; } - - /**@brief Subtract a vector from this one + /**@brief Subtract a vector from this one * @param The vector to subtract */ B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_sub_ps(mVec128, v.mVec128); #elif defined(B3_USE_NEON) mVec128 = vsubq_f32(mVec128, v.mVec128); @@ -166,13 +159,13 @@ public: return *this; } - /**@brief Scale the vector + /**@brief Scale the vector * @param s Scale factor */ B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) mVec128 = _mm_mul_ps(mVec128, vs); #elif defined(B3_USE_NEON) mVec128 = vmulq_n_f32(mVec128, s); @@ -184,13 +177,13 @@ public: return *this; } - /**@brief Inversely scale the vector + /**@brief Inversely scale the vector * @param s Scale factor to divide by */ B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s) { b3FullAssert(s != b3Scalar(0.0)); -#if 0 //defined(B3_USE_SSE_IN_API) +#if 0 //defined(B3_USE_SSE_IN_API) // this code is not faster ! __m128 vs = _mm_load_ss(&s); vs = _mm_div_ss(b3v1110, vs); @@ -204,11 +197,11 @@ public: #endif } - /**@brief Return the dot product + /**@brief Return the dot product * @param v The other vector in the dot product */ B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) __m128 vd = _mm_mul_ps(mVec128, v.mVec128); __m128 z = _mm_movehl_ps(vd, vd); __m128 y = _mm_shuffle_ps(vd, vd, 0x55); @@ -221,29 +214,29 @@ public: x = vadd_f32(x, vget_high_f32(vd)); return vget_lane_f32(x, 0); #else - return m_floats[0] * v.m_floats[0] + - m_floats[1] * v.m_floats[1] + - m_floats[2] * v.m_floats[2]; + return m_floats[0] * v.m_floats[0] + + m_floats[1] * v.m_floats[1] + + m_floats[2] * v.m_floats[2]; #endif } - /**@brief Return the length of the vector squared */ + /**@brief Return the length of the vector squared */ B3_FORCE_INLINE b3Scalar length2() const { return dot(*this); } - /**@brief Return the length of the vector */ + /**@brief Return the length of the vector */ B3_FORCE_INLINE b3Scalar length() const { return b3Sqrt(length2()); } - /**@brief Return the distance squared between the ends of this and another vector + /**@brief Return the distance squared between the ends of this and another vector * This is symantically treating the vector like a point */ B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const; - /**@brief Return the distance between the ends of this and another vector + /**@brief Return the distance between the ends of this and another vector * This is symantically treating the vector like a point */ B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const; @@ -251,7 +244,7 @@ public: { b3Scalar l2 = length2(); //triNormal.normalize(); - if (l2 >= B3_EPSILON*B3_EPSILON) + if (l2 >= B3_EPSILON * B3_EPSILON) { (*this) /= b3Sqrt(l2); } @@ -262,43 +255,42 @@ public: return *this; } - /**@brief Normalize this vector + /**@brief Normalize this vector * x^2 + y^2 + z^2 = 1 */ B3_FORCE_INLINE b3Vector3& normalize() { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - // dot product first +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + // dot product first __m128 vd = _mm_mul_ps(mVec128, mVec128); __m128 z = _mm_movehl_ps(vd, vd); __m128 y = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, y); vd = _mm_add_ss(vd, z); - #if 0 +#if 0 vd = _mm_sqrt_ss(vd); vd = _mm_div_ss(b3v1110, vd); vd = b3_splat_ps(vd, 0x80); mVec128 = _mm_mul_ps(mVec128, vd); - #else +#else - // NR step 1/sqrt(x) - vd is x, y is output - y = _mm_rsqrt_ss(vd); // estimate + // NR step 1/sqrt(x) - vd is x, y is output + y = _mm_rsqrt_ss(vd); // estimate - // one step NR - z = b3v1_5; - vd = _mm_mul_ss(vd, b3vHalf); // vd * 0.5 - //x2 = vd; - vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 - vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0 - z = _mm_sub_ss(z, vd); // 1.5 - vd * 0.5 * y0 * y0 + // one step NR + z = b3v1_5; + vd = _mm_mul_ss(vd, b3vHalf); // vd * 0.5 + //x2 = vd; + vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 + vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0 + z = _mm_sub_ss(z, vd); // 1.5 - vd * 0.5 * y0 * y0 - y = _mm_mul_ss(y, z); // y0 * (1.5 - vd * 0.5 * y0 * y0) + y = _mm_mul_ss(y, z); // y0 * (1.5 - vd * 0.5 * y0 * y0) y = b3_splat_ps(y, 0x80); mVec128 = _mm_mul_ps(mVec128, y); - #endif - +#endif return *this; #else @@ -306,15 +298,15 @@ public: #endif } - /**@brief Return a normalized version of this vector */ + /**@brief Return a normalized version of this vector */ B3_FORCE_INLINE b3Vector3 normalized() const; - /**@brief Return a rotated version of this vector + /**@brief Return a rotated version of this vector * @param wAxis The axis to rotate about * @param angle The angle to rotate by */ - B3_FORCE_INLINE b3Vector3 rotate( const b3Vector3& wAxis, const b3Scalar angle ) const; + B3_FORCE_INLINE b3Vector3 rotate(const b3Vector3& wAxis, const b3Scalar angle) const; - /**@brief Return the angle between this and another vector + /**@brief Return the angle between this and another vector * @param v The other vector */ B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const { @@ -323,10 +315,10 @@ public: return b3Acos(dot(v) / s); } - /**@brief Return a vector will the absolute values of each element */ + /**@brief Return a vector will the absolute values of each element */ B3_FORCE_INLINE b3Vector3 absolute() const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3MakeVector3(_mm_and_ps(mVec128, b3v3AbsfMask)); #elif defined(B3_USE_NEON) return b3Vector3(vabsq_f32(mVec128)); @@ -338,15 +330,15 @@ public: #endif } - /**@brief Return the cross product between this and another vector + /**@brief Return the cross product between this and another vector * @param v The other vector */ B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 T, V; +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 T, V; - T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) V = _mm_mul_ps(V, mVec128); T = _mm_mul_ps(T, v.mVec128); @@ -381,10 +373,10 @@ public: B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) // cross: - __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) V = _mm_mul_ps(V, v1.mVec128); T = _mm_mul_ps(T, v2.mVec128); @@ -422,25 +414,24 @@ public: x = vadd_f32(x, vget_high_f32(V)); return vget_lane_f32(x, 0); #else - return - m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + - m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + - m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]); + return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + + m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + + m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]); #endif } - /**@brief Return the axis with the smallest value + /**@brief Return the axis with the smallest value * Note return values are 0,1,2 for x, y, or z */ B3_FORCE_INLINE int minAxis() const { - return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2); + return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2); } - /**@brief Return the axis with the largest value + /**@brief Return the axis with the largest value * Note return values are 0,1,2 for x, y, or z */ B3_FORCE_INLINE int maxAxis() const { - return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0); + return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0); } B3_FORCE_INLINE int furthestAxis() const @@ -453,18 +444,17 @@ public: return absolute().maxAxis(); } - B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vrt = _mm_load_ss(&rt); // (rt 0 0 0) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vrt = _mm_load_ss(&rt); // (rt 0 0 0) b3Scalar s = b3Scalar(1.0) - rt; - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) __m128 r0 = _mm_mul_ps(v0.mVec128, vs); - vrt = b3_pshufd_ps(vrt, 0x80); // (rt rt rt 0.0) + vrt = b3_pshufd_ps(vrt, 0x80); // (rt rt rt 0.0) __m128 r1 = _mm_mul_ps(v1.mVec128, vrt); - __m128 tmp3 = _mm_add_ps(r0,r1); + __m128 tmp3 = _mm_add_ps(r0, r1); mVec128 = tmp3; #elif defined(B3_USE_NEON) float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128); @@ -480,14 +470,14 @@ public: #endif } - /**@brief Return the linear interpolation between this and another vector + /**@brief Return the linear interpolation between this and another vector * @param v The other vector * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */ B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vt = _mm_load_ss(&t); // (t 0 0 0) - vt = b3_pshufd_ps(vt, 0x80); // (rt rt rt 0.0) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vt = _mm_load_ss(&t); // (t 0 0 0) + vt = b3_pshufd_ps(vt, 0x80); // (rt rt rt 0.0) __m128 vl = _mm_sub_ps(v.mVec128, mVec128); vl = _mm_mul_ps(vl, vt); vl = _mm_add_ps(vl, mVec128); @@ -500,18 +490,17 @@ public: return b3Vector3(vl); #else - return - b3MakeVector3( m_floats[0] + (v.m_floats[0] - m_floats[0]) * t, - m_floats[1] + (v.m_floats[1] - m_floats[1]) * t, - m_floats[2] + (v.m_floats[2] - m_floats[2]) * t); + return b3MakeVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t, + m_floats[1] + (v.m_floats[1] - m_floats[1]) * t, + m_floats[2] + (v.m_floats[2] - m_floats[2]) * t); #endif } - /**@brief Elementwise multiply this vector by the other + /**@brief Elementwise multiply this vector by the other * @param v The other vector */ B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_mul_ps(mVec128, v.mVec128); #elif defined(B3_USE_NEON) mVec128 = vmulq_f32(mVec128, v.mVec128); @@ -523,53 +512,53 @@ public: return *this; } - /**@brief Return the x value */ - B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; } - /**@brief Return the y value */ - B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; } - /**@brief Return the z value */ - B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; } -/**@brief Return the w value */ - B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; } - - /**@brief Set the x value */ - B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x;}; - /**@brief Set the y value */ - B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y;}; - /**@brief Set the z value */ - B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z;}; - /**@brief Set the w value */ - B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w;}; + /**@brief Return the x value */ + B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; } + /**@brief Return the y value */ + B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; } + /**@brief Return the z value */ + B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; } + /**@brief Return the w value */ + B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; } + + /**@brief Set the x value */ + B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; }; + /**@brief Set the y value */ + B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; }; + /**@brief Set the z value */ + B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; }; + /**@brief Set the w value */ + B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; }; //B3_FORCE_INLINE b3Scalar& operator[](int i) { return (&m_floats[0])[i]; } //B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; } ///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons. - B3_FORCE_INLINE operator b3Scalar *() { return &m_floats[0]; } - B3_FORCE_INLINE operator const b3Scalar *() const { return &m_floats[0]; } + B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; } + B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; } - B3_FORCE_INLINE bool operator==(const b3Vector3& other) const + B3_FORCE_INLINE bool operator==(const b3Vector3& other) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); #else - return ((m_floats[3]==other.m_floats[3]) && - (m_floats[2]==other.m_floats[2]) && - (m_floats[1]==other.m_floats[1]) && - (m_floats[0]==other.m_floats[0])); + return ((m_floats[3] == other.m_floats[3]) && + (m_floats[2] == other.m_floats[2]) && + (m_floats[1] == other.m_floats[1]) && + (m_floats[0] == other.m_floats[0])); #endif } - B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const + B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const { return !(*this == other); } - /**@brief Set each element to the max of the current values and the values of another b3Vector3 + /**@brief Set each element to the max of the current values and the values of another b3Vector3 * @param other The other b3Vector3 to compare with */ - B3_FORCE_INLINE void setMax(const b3Vector3& other) + B3_FORCE_INLINE void setMax(const b3Vector3& other) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_max_ps(mVec128, other.mVec128); #elif defined(B3_USE_NEON) mVec128 = vmaxq_f32(mVec128, other.mVec128); @@ -581,12 +570,12 @@ public: #endif } - /**@brief Set each element to the min of the current values and the values of another b3Vector3 + /**@brief Set each element to the min of the current values and the values of another b3Vector3 * @param other The other b3Vector3 to compare with */ - B3_FORCE_INLINE void setMin(const b3Vector3& other) + B3_FORCE_INLINE void setMin(const b3Vector3& other) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = _mm_min_ps(mVec128, other.mVec128); #elif defined(B3_USE_NEON) mVec128 = vminq_f32(mVec128, other.mVec128); @@ -598,46 +587,46 @@ public: #endif } - B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) + B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z) { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; m_floats[3] = b3Scalar(0.f); } - void getSkewSymmetricMatrix(b3Vector3* v0,b3Vector3* v1,b3Vector3* v2) const + void getSkewSymmetricMatrix(b3Vector3 * v0, b3Vector3 * v1, b3Vector3 * v2) const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) - __m128 V = _mm_and_ps(mVec128, b3vFFF0fMask); + __m128 V = _mm_and_ps(mVec128, b3vFFF0fMask); __m128 V0 = _mm_xor_ps(b3vMzeroMask, V); __m128 V2 = _mm_movelh_ps(V0, V); __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE); - V0 = _mm_shuffle_ps(V0, V, 0xDB); + V0 = _mm_shuffle_ps(V0, V, 0xDB); V2 = _mm_shuffle_ps(V2, V, 0xF9); v0->mVec128 = V0; v1->mVec128 = V1; v2->mVec128 = V2; #else - v0->setValue(0. ,-getZ() ,getY()); - v1->setValue(getZ() ,0. ,-getX()); - v2->setValue(-getY() ,getX() ,0.); + v0->setValue(0., -getZ(), getY()); + v1->setValue(getZ(), 0., -getX()); + v2->setValue(-getY(), getX(), 0.); #endif } void setZero() { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128); #elif defined(B3_USE_NEON) int32x4_t vi = vdupq_n_s32(0); mVec128 = vreinterpretq_f32_s32(vi); #else - setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); #endif } @@ -651,76 +640,76 @@ public: return length2() < B3_EPSILON; } - B3_FORCE_INLINE void serialize(struct b3Vector3Data& dataOut) const; + B3_FORCE_INLINE void serialize(struct b3Vector3Data & dataOut) const; - B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn); + B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn); - B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData& dataOut) const; + B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData & dataOut) const; - B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn); + B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn); - B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData& dataOut) const; + B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData & dataOut) const; - B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn); + B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn); - /**@brief returns index of maximum dot product between this and vectors in array[] + /**@brief returns index of maximum dot product between this and vectors in array[] * @param array The other vectors * @param array_count The number of other vectors * @param dotOut The maximum dot product */ - B3_FORCE_INLINE long maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const; + B3_FORCE_INLINE long maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const; - /**@brief returns index of minimum dot product between this and vectors in array[] + /**@brief returns index of minimum dot product between this and vectors in array[] * @param array The other vectors * @param array_count The number of other vectors * @param dotOut The minimum dot product */ - B3_FORCE_INLINE long minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const; - - /* create a vector as b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 )) */ - B3_FORCE_INLINE b3Vector3 dot3( const b3Vector3 &v0, const b3Vector3 &v1, const b3Vector3 &v2 ) const - { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - - __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 ); - __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 ); - __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 ); - __m128 b0 = _mm_unpacklo_ps( a0, a1 ); - __m128 b1 = _mm_unpackhi_ps( a0, a1 ); - __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() ); - __m128 r = _mm_movelh_ps( b0, b2 ); - r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 )); - a2 = _mm_and_ps( a2, b3vxyzMaskf); - r = _mm_add_ps( r, b3CastdTo128f (_mm_move_sd( b3CastfTo128d(a2), b3CastfTo128d(b1) ))); - return b3MakeVector3(r); + B3_FORCE_INLINE long minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const; + + /* create a vector as b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 )) */ + B3_FORCE_INLINE b3Vector3 dot3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) const + { +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + + __m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128); + __m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128); + __m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128); + __m128 b0 = _mm_unpacklo_ps(a0, a1); + __m128 b1 = _mm_unpackhi_ps(a0, a1); + __m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps()); + __m128 r = _mm_movelh_ps(b0, b2); + r = _mm_add_ps(r, _mm_movehl_ps(b2, b0)); + a2 = _mm_and_ps(a2, b3vxyzMaskf); + r = _mm_add_ps(r, b3CastdTo128f(_mm_move_sd(b3CastfTo128d(a2), b3CastfTo128d(b1)))); + return b3MakeVector3(r); #elif defined(B3_USE_NEON) - static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 }; - float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128); - float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128); - float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128); - float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1)); - a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask ); - float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] ); - float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f)); - return b3Vector3( vcombine_f32(b0, b1) ); + static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0}; + float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128); + float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128); + float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128); + float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1)); + a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask); + float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]); + float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f)); + return b3Vector3(vcombine_f32(b0, b1)); #else - return b3MakeVector3( dot(v0), dot(v1), dot(v2)); + return b3MakeVector3(dot(v0), dot(v1), dot(v2)); #endif - } + } }; /**@brief Return the sum of two vectors (Point symantics)*/ B3_FORCE_INLINE b3Vector3 operator+(const b3Vector3& v1, const b3Vector3& v2) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128)); #elif defined(B3_USE_NEON) return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128)); #else return b3MakeVector3( - v1.m_floats[0] + v2.m_floats[0], - v1.m_floats[1] + v2.m_floats[1], - v1.m_floats[2] + v2.m_floats[2]); + v1.m_floats[0] + v2.m_floats[0], + v1.m_floats[1] + v2.m_floats[1], + v1.m_floats[2] + v2.m_floats[2]); #endif } @@ -728,15 +717,15 @@ operator+(const b3Vector3& v1, const b3Vector3& v2) B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& v1, const b3Vector3& v2) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128)); #elif defined(B3_USE_NEON) return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128)); #else return b3MakeVector3( - v1.m_floats[0] * v2.m_floats[0], - v1.m_floats[1] * v2.m_floats[1], - v1.m_floats[2] * v2.m_floats[2]); + v1.m_floats[0] * v2.m_floats[0], + v1.m_floats[1] * v2.m_floats[1], + v1.m_floats[2] * v2.m_floats[2]); #endif } @@ -744,7 +733,7 @@ operator*(const b3Vector3& v1, const b3Vector3& v2) B3_FORCE_INLINE b3Vector3 operator-(const b3Vector3& v1, const b3Vector3& v2) { -#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) // without _mm_and_ps this code causes slowdown in Concave moving __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128); @@ -754,9 +743,9 @@ operator-(const b3Vector3& v1, const b3Vector3& v2) return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask)); #else return b3MakeVector3( - v1.m_floats[0] - v2.m_floats[0], - v1.m_floats[1] - v2.m_floats[1], - v1.m_floats[2] - v2.m_floats[2]); + v1.m_floats[0] - v2.m_floats[0], + v1.m_floats[1] - v2.m_floats[1], + v1.m_floats[2] - v2.m_floats[2]); #endif } @@ -764,7 +753,7 @@ operator-(const b3Vector3& v1, const b3Vector3& v2) B3_FORCE_INLINE b3Vector3 operator-(const b3Vector3& v) { -#if (defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) __m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask); return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask)); #elif defined(B3_USE_NEON) @@ -778,9 +767,9 @@ operator-(const b3Vector3& v) B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& v, const b3Scalar& s) { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = b3_pshufd_ps(vs, 0x80); // (S S S 0.0) return b3MakeVector3(_mm_mul_ps(v.mVec128, vs)); #elif defined(B3_USE_NEON) float32x4_t r = vmulq_n_f32(v.mVec128, s); @@ -802,7 +791,7 @@ B3_FORCE_INLINE b3Vector3 operator/(const b3Vector3& v, const b3Scalar& s) { b3FullAssert(s != b3Scalar(0.0)); -#if 0 //defined(B3_USE_SSE_IN_API) +#if 0 //defined(B3_USE_SSE_IN_API) // this code is not faster ! __m128 vs = _mm_load_ss(&s); vs = _mm_div_ss(b3v1110, vs); @@ -818,7 +807,7 @@ operator/(const b3Vector3& v, const b3Scalar& s) B3_FORCE_INLINE b3Vector3 operator/(const b3Vector3& v1, const b3Vector3& v2) { -#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) +#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128); vec = _mm_and_ps(vec, b3vFFF0fMask); return b3MakeVector3(vec); @@ -828,19 +817,19 @@ operator/(const b3Vector3& v1, const b3Vector3& v2) x = v1.mVec128; y = v2.mVec128; - v = vrecpeq_f32(y); // v ~ 1/y - m = vrecpsq_f32(y, v); // m = (2-v*y) - v = vmulq_f32(v, m); // vv = v*m ~~ 1/y - m = vrecpsq_f32(y, v); // mm = (2-vv*y) - v = vmulq_f32(v, x); // x*vv - v = vmulq_f32(v, m); // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y + v = vrecpeq_f32(y); // v ~ 1/y + m = vrecpsq_f32(y, v); // m = (2-v*y) + v = vmulq_f32(v, m); // vv = v*m ~~ 1/y + m = vrecpsq_f32(y, v); // mm = (2-vv*y) + v = vmulq_f32(v, x); // x*vv + v = vmulq_f32(v, m); // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y return b3Vector3(v); #else return b3MakeVector3( - v1.m_floats[0] / v2.m_floats[0], - v1.m_floats[1] / v2.m_floats[1], - v1.m_floats[2] / v2.m_floats[2]); + v1.m_floats[0] / v2.m_floats[0], + v1.m_floats[1] / v2.m_floats[1], + v1.m_floats[2] / v2.m_floats[2]); #endif } @@ -851,7 +840,6 @@ b3Dot(const b3Vector3& v1, const b3Vector3& v2) return v1.dot(v2); } - /**@brief Return the distance squared between two vectors */ B3_FORCE_INLINE b3Scalar b3Distance2(const b3Vector3& v1, const b3Vector3& v2) @@ -859,7 +847,6 @@ b3Distance2(const b3Vector3& v1, const b3Vector3& v2) return v1.distance2(v2); } - /**@brief Return the distance between two vectors */ B3_FORCE_INLINE b3Scalar b3Distance(const b3Vector3& v1, const b3Vector3& v2) @@ -897,8 +884,6 @@ b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t) return v1.lerp(v2, t); } - - B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const { return (v - *this).length2(); @@ -911,7 +896,7 @@ B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) b3Vector3 norm = *this; return norm.normalize(); @@ -920,143 +905,136 @@ B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const #endif } -B3_FORCE_INLINE b3Vector3 b3Vector3::rotate( const b3Vector3& wAxis, const b3Scalar _angle ) const +B3_FORCE_INLINE b3Vector3 b3Vector3::rotate(const b3Vector3& wAxis, const b3Scalar _angle) const { // wAxis must be a unit lenght vector -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) - __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128); - b3Scalar ssin = b3Sin( _angle ); - __m128 C = wAxis.cross( b3MakeVector3(mVec128) ).mVec128; + __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128); + b3Scalar ssin = b3Sin(_angle); + __m128 C = wAxis.cross(b3MakeVector3(mVec128)).mVec128; O = _mm_and_ps(O, b3vFFF0fMask); - b3Scalar scos = b3Cos( _angle ); + b3Scalar scos = b3Cos(_angle); - __m128 vsin = _mm_load_ss(&ssin); // (S 0 0 0) - __m128 vcos = _mm_load_ss(&scos); // (S 0 0 0) + __m128 vsin = _mm_load_ss(&ssin); // (S 0 0 0) + __m128 vcos = _mm_load_ss(&scos); // (S 0 0 0) - __m128 Y = b3_pshufd_ps(O, 0xC9); // (Y Z X 0) - __m128 Z = b3_pshufd_ps(O, 0xD2); // (Z X Y 0) + __m128 Y = b3_pshufd_ps(O, 0xC9); // (Y Z X 0) + __m128 Z = b3_pshufd_ps(O, 0xD2); // (Z X Y 0) O = _mm_add_ps(O, Y); - vsin = b3_pshufd_ps(vsin, 0x80); // (S S S 0) + vsin = b3_pshufd_ps(vsin, 0x80); // (S S S 0) O = _mm_add_ps(O, Z); - vcos = b3_pshufd_ps(vcos, 0x80); // (S S S 0) + vcos = b3_pshufd_ps(vcos, 0x80); // (S S S 0) - vsin = vsin * C; + vsin = vsin * C; O = O * wAxis.mVec128; __m128 X = mVec128 - O; - O = O + vsin; + O = O + vsin; vcos = vcos * X; O = O + vcos; return b3MakeVector3(O); #else - b3Vector3 o = wAxis * wAxis.dot( *this ); + b3Vector3 o = wAxis * wAxis.dot(*this); b3Vector3 _x = *this - o; b3Vector3 _y; - _y = wAxis.cross( *this ); + _y = wAxis.cross(*this); - return ( o + _x * b3Cos( _angle ) + _y * b3Sin( _angle ) ); + return (o + _x * b3Cos(_angle) + _y * b3Sin(_angle)); #endif } -B3_FORCE_INLINE long b3Vector3::maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const +B3_FORCE_INLINE long b3Vector3::maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const { -#if defined (B3_USE_SSE) || defined (B3_USE_NEON) - #if defined _WIN32 || defined (B3_USE_SSE) - const long scalar_cutoff = 10; - long b3_maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #elif defined B3_USE_NEON - const long scalar_cutoff = 4; - extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #endif - if( array_count < scalar_cutoff ) +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) +#if defined _WIN32 || defined(B3_USE_SSE) + const long scalar_cutoff = 10; + long b3_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#elif defined B3_USE_NEON + const long scalar_cutoff = 4; + extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#endif + if (array_count < scalar_cutoff) #else -#endif//B3_USE_SSE || B3_USE_NEON - { - b3Scalar maxDot = -B3_INFINITY; - int i = 0; - int ptIndex = -1; - for( i = 0; i < array_count; i++ ) - { - b3Scalar dot = array[i].dot(*this); - - if( dot > maxDot ) - { - maxDot = dot; - ptIndex = i; - } - } - - b3Assert(ptIndex>=0); - if (ptIndex<0) +#endif //B3_USE_SSE || B3_USE_NEON + { + b3Scalar maxDot = -B3_INFINITY; + int i = 0; + int ptIndex = -1; + for (i = 0; i < array_count; i++) + { + b3Scalar dot = array[i].dot(*this); + + if (dot > maxDot) + { + maxDot = dot; + ptIndex = i; + } + } + + b3Assert(ptIndex >= 0); + if (ptIndex < 0) { ptIndex = 0; } - dotOut = maxDot; - return ptIndex; - } -#if defined (B3_USE_SSE) || defined (B3_USE_NEON) - return b3_maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut ); + dotOut = maxDot; + return ptIndex; + } +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) + return b3_maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut); #endif } -B3_FORCE_INLINE long b3Vector3::minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const +B3_FORCE_INLINE long b3Vector3::minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const { -#if defined (B3_USE_SSE) || defined (B3_USE_NEON) - #if defined B3_USE_SSE - const long scalar_cutoff = 10; - long b3_mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #elif defined B3_USE_NEON - const long scalar_cutoff = 4; - extern long (*b3_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #else - #error unhandled arch! - #endif - - if( array_count < scalar_cutoff ) -#endif//B3_USE_SSE || B3_USE_NEON - { - b3Scalar minDot = B3_INFINITY; - int i = 0; - int ptIndex = -1; - - for( i = 0; i < array_count; i++ ) - { - b3Scalar dot = array[i].dot(*this); - - if( dot < minDot ) - { - minDot = dot; - ptIndex = i; - } - } - - dotOut = minDot; - - return ptIndex; - } -#if defined (B3_USE_SSE) || defined (B3_USE_NEON) - return b3_mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut ); +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) +#if defined B3_USE_SSE + const long scalar_cutoff = 10; + long b3_mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#elif defined B3_USE_NEON + const long scalar_cutoff = 4; + extern long (*b3_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#else +#error unhandled arch! #endif -} - - -class b3Vector4 : public b3Vector3 -{ -public: + if (array_count < scalar_cutoff) +#endif //B3_USE_SSE || B3_USE_NEON + { + b3Scalar minDot = B3_INFINITY; + int i = 0; + int ptIndex = -1; + for (i = 0; i < array_count; i++) + { + b3Scalar dot = array[i].dot(*this); + if (dot < minDot) + { + minDot = dot; + ptIndex = i; + } + } + dotOut = minDot; + return ptIndex; + } +#if defined(B3_USE_SSE) || defined(B3_USE_NEON) + return b3_mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut); +#endif +} +class b3Vector4 : public b3Vector3 +{ +public: B3_FORCE_INLINE b3Vector4 absolute4() const { -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask)); #elif defined(B3_USE_NEON) return b3Vector4(vabsq_f32(mVec128)); @@ -1069,11 +1047,9 @@ public: #endif } + b3Scalar getW() const { return m_floats[3]; } - b3Scalar getW() const { return m_floats[3];} - - - B3_FORCE_INLINE int maxAxis4() const + B3_FORCE_INLINE int maxAxis4() const { int maxIndex = -1; b3Scalar maxVal = b3Scalar(-B3_LARGE_FLOAT); @@ -1090,7 +1066,7 @@ public: if (m_floats[2] > maxVal) { maxIndex = 2; - maxVal =m_floats[2]; + maxVal = m_floats[2]; } if (m_floats[3] > maxVal) { @@ -1100,7 +1076,6 @@ public: return maxIndex; } - B3_FORCE_INLINE int minAxis4() const { int minIndex = -1; @@ -1118,7 +1093,7 @@ public: if (m_floats[2] < minVal) { minIndex = 2; - minVal =m_floats[2]; + minVal = m_floats[2]; } if (m_floats[3] < minVal) { @@ -1129,216 +1104,200 @@ public: return minIndex; } - B3_FORCE_INLINE int closestAxis4() const { return absolute4().maxAxis4(); } - - - - /**@brief Set x,y,z and zero w + /**@brief Set x,y,z and zero w * @param x Value of x * @param y Value of y * @param z Value of z */ - -/* void getValue(b3Scalar *m) const + /* void getValue(b3Scalar *m) const { m[0] = m_floats[0]; m[1] = m_floats[1]; m[2] =m_floats[2]; } */ -/**@brief Set the values + /**@brief Set the values * @param x Value of x * @param y Value of y * @param z Value of z * @param w Value of w */ - B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3]=_w; - } - - + B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = _w; + } }; - ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal) +B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal) { - #ifdef B3_USE_DOUBLE_PRECISION - unsigned char* dest = (unsigned char*) &destVal; - unsigned char* src = (unsigned char*) &sourceVal; +#ifdef B3_USE_DOUBLE_PRECISION + unsigned char* dest = (unsigned char*)&destVal; + unsigned char* src = (unsigned char*)&sourceVal; dest[0] = src[7]; - dest[1] = src[6]; - dest[2] = src[5]; - dest[3] = src[4]; - dest[4] = src[3]; - dest[5] = src[2]; - dest[6] = src[1]; - dest[7] = src[0]; + dest[1] = src[6]; + dest[2] = src[5]; + dest[3] = src[4]; + dest[4] = src[3]; + dest[5] = src[2]; + dest[6] = src[1]; + dest[7] = src[0]; #else - unsigned char* dest = (unsigned char*) &destVal; - unsigned char* src = (unsigned char*) &sourceVal; + unsigned char* dest = (unsigned char*)&destVal; + unsigned char* src = (unsigned char*)&sourceVal; dest[0] = src[3]; - dest[1] = src[2]; - dest[2] = src[1]; - dest[3] = src[0]; -#endif //B3_USE_DOUBLE_PRECISION + dest[1] = src[2]; + dest[2] = src[1]; + dest[3] = src[0]; +#endif //B3_USE_DOUBLE_PRECISION } ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec) +B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { - b3SwapScalarEndian(sourceVec[i],destVec[i]); + b3SwapScalarEndian(sourceVec[i], destVec[i]); } - } ///b3UnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector) +B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector) { - - b3Vector3 swappedVec; - for (int i=0;i<4;i++) + b3Vector3 swappedVec; + for (int i = 0; i < 4; i++) { - b3SwapScalarEndian(vector[i],swappedVec[i]); + b3SwapScalarEndian(vector[i], swappedVec[i]); } vector = swappedVec; } template <class T> -B3_FORCE_INLINE void b3PlaneSpace1 (const T& n, T& p, T& q) +B3_FORCE_INLINE void b3PlaneSpace1(const T& n, T& p, T& q) { - if (b3Fabs(n[2]) > B3_SQRT12) { - // choose p in y-z plane - b3Scalar a = n[1]*n[1] + n[2]*n[2]; - b3Scalar k = b3RecipSqrt (a); - p[0] = 0; - p[1] = -n[2]*k; - p[2] = n[1]*k; - // set q = n x p - q[0] = a*k; - q[1] = -n[0]*p[2]; - q[2] = n[0]*p[1]; - } - else { - // choose p in x-y plane - b3Scalar a = n[0]*n[0] + n[1]*n[1]; - b3Scalar k = b3RecipSqrt (a); - p[0] = -n[1]*k; - p[1] = n[0]*k; - p[2] = 0; - // set q = n x p - q[0] = -n[2]*p[1]; - q[1] = n[2]*p[0]; - q[2] = a*k; - } + if (b3Fabs(n[2]) > B3_SQRT12) + { + // choose p in y-z plane + b3Scalar a = n[1] * n[1] + n[2] * n[2]; + b3Scalar k = b3RecipSqrt(a); + p[0] = 0; + p[1] = -n[2] * k; + p[2] = n[1] * k; + // set q = n x p + q[0] = a * k; + q[1] = -n[0] * p[2]; + q[2] = n[0] * p[1]; + } + else + { + // choose p in x-y plane + b3Scalar a = n[0] * n[0] + n[1] * n[1]; + b3Scalar k = b3RecipSqrt(a); + p[0] = -n[1] * k; + p[1] = n[0] * k; + p[2] = 0; + // set q = n x p + q[0] = -n[2] * p[1]; + q[1] = n[2] * p[0]; + q[2] = a * k; + } } - -struct b3Vector3FloatData +struct b3Vector3FloatData { - float m_floats[4]; + float m_floats[4]; }; -struct b3Vector3DoubleData +struct b3Vector3DoubleData { - double m_floats[4]; - + double m_floats[4]; }; -B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const +B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = float(m_floats[i]); } -B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn) +B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = b3Scalar(dataIn.m_floats[i]); } - -B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const +B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = double(m_floats[i]); } -B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn) +B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = b3Scalar(dataIn.m_floats[i]); } - -B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const +B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = m_floats[i]; } -B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn) +B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = dataIn.m_floats[i]; } - - - -inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z) +inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z) { - b3Vector3 tmp; - tmp.setValue(x,y,z); + b3Vector3 tmp; + tmp.setValue(x, y, z); return tmp; } -inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w) +inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w) { - b3Vector3 tmp; - tmp.setValue(x,y,z); + b3Vector3 tmp; + tmp.setValue(x, y, z); tmp.w = w; return tmp; } -inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w) +inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w) { - b3Vector4 tmp; - tmp.setValue(x,y,z,w); + b3Vector4 tmp; + tmp.setValue(x, y, z, w); return tmp; } -#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE) +#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE) -inline b3Vector3 b3MakeVector3( b3SimdFloat4 v) +inline b3Vector3 b3MakeVector3(b3SimdFloat4 v) { - b3Vector3 tmp; - tmp.set128(v); - return tmp; + b3Vector3 tmp; + tmp.set128(v); + return tmp; } inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec) { - b3Vector4 tmp; + b3Vector4 tmp; tmp.set128(vec); return tmp; } #endif - -#endif //B3_VECTOR3_H +#endif //B3_VECTOR3_H diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Float4.h b/thirdparty/bullet/Bullet3Common/shared/b3Float4.h index 5e4b95bcee..d8a9f47411 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3Float4.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3Float4.h @@ -4,94 +4,87 @@ #include "Bullet3Common/shared/b3PlatformDefinitions.h" #ifdef __cplusplus - #include "Bullet3Common/b3Vector3.h" - #define b3Float4 b3Vector3 - #define b3Float4ConstArg const b3Vector3& - #define b3Dot3F4 b3Dot - #define b3Cross3 b3Cross - #define b3MakeFloat4 b3MakeVector3 - inline b3Vector3 b3Normalized(const b3Vector3& vec) - { - return vec.normalized(); - } - - inline b3Float4 b3FastNormalized3(b3Float4ConstArg v) - { - return v.normalized(); - } - - inline b3Float4 b3MaxFloat4 (const b3Float4& a, const b3Float4& b) - { - b3Float4 tmp = a; - tmp.setMax(b); - return tmp; - } - inline b3Float4 b3MinFloat4 (const b3Float4& a, const b3Float4& b) - { - b3Float4 tmp = a; - tmp.setMin(b); - return tmp; - } +#include "Bullet3Common/b3Vector3.h" +#define b3Float4 b3Vector3 +#define b3Float4ConstArg const b3Vector3& +#define b3Dot3F4 b3Dot +#define b3Cross3 b3Cross +#define b3MakeFloat4 b3MakeVector3 +inline b3Vector3 b3Normalized(const b3Vector3& vec) +{ + return vec.normalized(); +} +inline b3Float4 b3FastNormalized3(b3Float4ConstArg v) +{ + return v.normalized(); +} +inline b3Float4 b3MaxFloat4(const b3Float4& a, const b3Float4& b) +{ + b3Float4 tmp = a; + tmp.setMax(b); + return tmp; +} +inline b3Float4 b3MinFloat4(const b3Float4& a, const b3Float4& b) +{ + b3Float4 tmp = a; + tmp.setMin(b); + return tmp; +} #else - typedef float4 b3Float4; - #define b3Float4ConstArg const b3Float4 - #define b3MakeFloat4 (float4) - float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1) - { - float4 a1 = b3MakeFloat4(v0.xyz,0.f); - float4 b1 = b3MakeFloat4(v1.xyz,0.f); - return dot(a1, b1); - } - b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1) - { - float4 a1 = b3MakeFloat4(v0.xyz,0.f); - float4 b1 = b3MakeFloat4(v1.xyz,0.f); - return cross(a1, b1); - } - #define b3MinFloat4 min - #define b3MaxFloat4 max - - #define b3Normalized(a) normalize(a) +typedef float4 b3Float4; +#define b3Float4ConstArg const b3Float4 +#define b3MakeFloat4 (float4) +float b3Dot3F4(b3Float4ConstArg v0, b3Float4ConstArg v1) +{ + float4 a1 = b3MakeFloat4(v0.xyz, 0.f); + float4 b1 = b3MakeFloat4(v1.xyz, 0.f); + return dot(a1, b1); +} +b3Float4 b3Cross3(b3Float4ConstArg v0, b3Float4ConstArg v1) +{ + float4 a1 = b3MakeFloat4(v0.xyz, 0.f); + float4 b1 = b3MakeFloat4(v1.xyz, 0.f); + return cross(a1, b1); +} +#define b3MinFloat4 min +#define b3MaxFloat4 max -#endif +#define b3Normalized(a) normalize(a) +#endif - inline bool b3IsAlmostZero(b3Float4ConstArg v) { - if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) + if (b3Fabs(v.x) > 1e-6 || b3Fabs(v.y) > 1e-6 || b3Fabs(v.z) > 1e-6) return false; return true; } - -inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut ) +inline int b3MaxDot(b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut) { - float maxDot = -B3_INFINITY; - int i = 0; - int ptIndex = -1; - for( i = 0; i < vecLen; i++ ) - { - float dot = b3Dot3F4(vecArray[i],vec); - - if( dot > maxDot ) - { - maxDot = dot; - ptIndex = i; - } - } - b3Assert(ptIndex>=0); - if (ptIndex<0) + float maxDot = -B3_INFINITY; + int i = 0; + int ptIndex = -1; + for (i = 0; i < vecLen; i++) + { + float dot = b3Dot3F4(vecArray[i], vec); + + if (dot > maxDot) + { + maxDot = dot; + ptIndex = i; + } + } + b3Assert(ptIndex >= 0); + if (ptIndex < 0) { ptIndex = 0; } - *dotOut = maxDot; - return ptIndex; + *dotOut = maxDot; + return ptIndex; } - - -#endif //B3_FLOAT4_H +#endif //B3_FLOAT4_H diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Int2.h b/thirdparty/bullet/Bullet3Common/shared/b3Int2.h index f1d01f81a5..7b84de4436 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3Int2.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3Int2.h @@ -20,11 +20,10 @@ subject to the following restrictions: struct b3UnsignedInt2 { - union - { + union { struct { - unsigned int x,y; + unsigned int x, y; }; struct { @@ -35,11 +34,10 @@ struct b3UnsignedInt2 struct b3Int2 { - union - { + union { struct { - int x,y; + int x, y; }; struct { @@ -51,7 +49,8 @@ struct b3Int2 inline b3Int2 b3MakeInt2(int x, int y) { b3Int2 v; - v.s[0] = x; v.s[1] = y; + v.s[0] = x; + v.s[1] = y; return v; } #else @@ -60,5 +59,5 @@ inline b3Int2 b3MakeInt2(int x, int y) #define b3Int2 int2 #define b3MakeInt2 (int2) -#endif //__cplusplus +#endif //__cplusplus #endif
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Int4.h b/thirdparty/bullet/Bullet3Common/shared/b3Int4.h index aa02d6beef..f6a1754245 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3Int4.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3Int4.h @@ -5,16 +5,15 @@ #include "Bullet3Common/b3Scalar.h" - -B3_ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4 +B3_ATTRIBUTE_ALIGNED16(struct) +b3UnsignedInt4 { B3_DECLARE_ALIGNED_ALLOCATOR(); - union - { + union { struct { - unsigned int x,y,z,w; + unsigned int x, y, z, w; }; struct { @@ -23,15 +22,15 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4 }; }; -B3_ATTRIBUTE_ALIGNED16(struct) b3Int4 +B3_ATTRIBUTE_ALIGNED16(struct) +b3Int4 { B3_DECLARE_ALIGNED_ALLOCATOR(); - union - { + union { struct { - int x,y,z,w; + int x, y, z, w; }; struct { @@ -43,26 +42,30 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3Int4 B3_FORCE_INLINE b3Int4 b3MakeInt4(int x, int y, int z, int w = 0) { b3Int4 v; - v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; + v.s[0] = x; + v.s[1] = y; + v.s[2] = z; + v.s[3] = w; return v; } B3_FORCE_INLINE b3UnsignedInt4 b3MakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0) { b3UnsignedInt4 v; - v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; + v.s[0] = x; + v.s[1] = y; + v.s[2] = z; + v.s[3] = w; return v; } #else - #define b3UnsignedInt4 uint4 #define b3Int4 int4 #define b3MakeInt4 (int4) #define b3MakeUnsignedInt4 (uint4) +#endif //__cplusplus -#endif //__cplusplus - -#endif //B3_INT4_H +#endif //B3_INT4_H diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h b/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h index 7b1fef32f8..ce6482b5a6 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h @@ -4,7 +4,6 @@ #include "Bullet3Common/shared/b3Quat.h" - #ifdef __cplusplus #include "Bullet3Common/b3Matrix3x3.h" @@ -22,43 +21,41 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg mat) return mat.absolute(); } -#define b3GetRow(m,row) m.getRow(row) +#define b3GetRow(m, row) m.getRow(row) -__inline -b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b) +__inline b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b) { - return b*a; + return b * a; } - #else typedef struct { b3Float4 m_row[3]; -}b3Mat3x3; +} b3Mat3x3; #define b3Mat3x3ConstArg const b3Mat3x3 -#define b3GetRow(m,row) (m.m_row[row]) +#define b3GetRow(m, row) (m.m_row[row]) inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat) { - b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f); + b3Float4 quat2 = (b3Float4)(quat.x * quat.x, quat.y * quat.y, quat.z * quat.z, 0.f); b3Mat3x3 out; - out.m_row[0].x=1-2*quat2.y-2*quat2.z; - out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z; - out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y; + out.m_row[0].x = 1 - 2 * quat2.y - 2 * quat2.z; + out.m_row[0].y = 2 * quat.x * quat.y - 2 * quat.w * quat.z; + out.m_row[0].z = 2 * quat.x * quat.z + 2 * quat.w * quat.y; out.m_row[0].w = 0.f; - out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z; - out.m_row[1].y=1-2*quat2.x-2*quat2.z; - out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x; + out.m_row[1].x = 2 * quat.x * quat.y + 2 * quat.w * quat.z; + out.m_row[1].y = 1 - 2 * quat2.x - 2 * quat2.z; + out.m_row[1].z = 2 * quat.y * quat.z - 2 * quat.w * quat.x; out.m_row[1].w = 0.f; - out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y; - out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x; - out.m_row[2].z=1-2*quat2.x-2*quat2.y; + out.m_row[2].x = 2 * quat.x * quat.z - 2 * quat.w * quat.y; + out.m_row[2].y = 2 * quat.y * quat.z + 2 * quat.w * quat.x; + out.m_row[2].z = 1 - 2 * quat2.x - 2 * quat2.y; out.m_row[2].w = 0.f; return out; @@ -73,27 +70,19 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn) return out; } +__inline b3Mat3x3 mtZero(); -__inline -b3Mat3x3 mtZero(); - -__inline -b3Mat3x3 mtIdentity(); +__inline b3Mat3x3 mtIdentity(); -__inline -b3Mat3x3 mtTranspose(b3Mat3x3 m); +__inline b3Mat3x3 mtTranspose(b3Mat3x3 m); -__inline -b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b); +__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b); -__inline -b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b); +__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b); -__inline -b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b); +__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b); -__inline -b3Mat3x3 mtZero() +__inline b3Mat3x3 mtZero() { b3Mat3x3 m; m.m_row[0] = (b3Float4)(0.f); @@ -102,18 +91,16 @@ b3Mat3x3 mtZero() return m; } -__inline -b3Mat3x3 mtIdentity() +__inline b3Mat3x3 mtIdentity() { b3Mat3x3 m; - m.m_row[0] = (b3Float4)(1,0,0,0); - m.m_row[1] = (b3Float4)(0,1,0,0); - m.m_row[2] = (b3Float4)(0,0,1,0); + m.m_row[0] = (b3Float4)(1, 0, 0, 0); + m.m_row[1] = (b3Float4)(0, 1, 0, 0); + m.m_row[2] = (b3Float4)(0, 0, 1, 0); return m; } -__inline -b3Mat3x3 mtTranspose(b3Mat3x3 m) +__inline b3Mat3x3 mtTranspose(b3Mat3x3 m) { b3Mat3x3 out; out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); @@ -122,58 +109,49 @@ b3Mat3x3 mtTranspose(b3Mat3x3 m) return out; } -__inline -b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b) +__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b) { b3Mat3x3 transB; - transB = mtTranspose( b ); + transB = mtTranspose(b); b3Mat3x3 ans; // why this doesn't run when 0ing in the for{} a.m_row[0].w = 0.f; a.m_row[1].w = 0.f; a.m_row[2].w = 0.f; - for(int i=0; i<3; i++) + for (int i = 0; i < 3; i++) { -// a.m_row[i].w = 0.f; - ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]); - ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]); - ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]); + // a.m_row[i].w = 0.f; + ans.m_row[i].x = b3Dot3F4(a.m_row[i], transB.m_row[0]); + ans.m_row[i].y = b3Dot3F4(a.m_row[i], transB.m_row[1]); + ans.m_row[i].z = b3Dot3F4(a.m_row[i], transB.m_row[2]); ans.m_row[i].w = 0.f; } return ans; } -__inline -b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b) +__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b) { b3Float4 ans; - ans.x = b3Dot3F4( a.m_row[0], b ); - ans.y = b3Dot3F4( a.m_row[1], b ); - ans.z = b3Dot3F4( a.m_row[2], b ); + ans.x = b3Dot3F4(a.m_row[0], b); + ans.y = b3Dot3F4(a.m_row[1], b); + ans.z = b3Dot3F4(a.m_row[2], b); ans.w = 0.f; return ans; } -__inline -b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b) +__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b) { b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); b3Float4 ans; - ans.x = b3Dot3F4( a, colx ); - ans.y = b3Dot3F4( a, coly ); - ans.z = b3Dot3F4( a, colz ); + ans.x = b3Dot3F4(a, colx); + ans.y = b3Dot3F4(a, coly); + ans.z = b3Dot3F4(a, colz); return ans; } - #endif - - - - - -#endif //B3_MAT3x3_H +#endif //B3_MAT3x3_H diff --git a/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h b/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h index 1c133fb088..b72bee9310 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h @@ -8,18 +8,18 @@ struct MyTest #ifdef __cplusplus //#define b3ConstArray(a) const b3AlignedObjectArray<a>& -#define b3ConstArray(a) const a* +#define b3ConstArray(a) const a * #define b3AtomicInc(a) ((*a)++) -inline int b3AtomicAdd (volatile int *p, int val) +inline int b3AtomicAdd(volatile int *p, int val) { int oldValue = *p; - int newValue = oldValue+val; + int newValue = oldValue + val; *p = newValue; return oldValue; } -#define __global +#define __global #define B3_STATIC static #else @@ -27,7 +27,7 @@ inline int b3AtomicAdd (volatile int *p, int val) #define B3_LARGE_FLOAT 1e18f #define B3_INFINITY 1e18f #define b3Assert(a) -#define b3ConstArray(a) __global const a* +#define b3ConstArray(a) __global const a * #define b3AtomicInc atomic_inc #define b3AtomicAdd atomic_add #define b3Fabs fabs diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Quat.h b/thirdparty/bullet/Bullet3Common/shared/b3Quat.h index f262d5e08f..940610c77b 100644 --- a/thirdparty/bullet/Bullet3Common/shared/b3Quat.h +++ b/thirdparty/bullet/Bullet3Common/shared/b3Quat.h @@ -5,35 +5,34 @@ #include "Bullet3Common/shared/b3Float4.h" #ifdef __cplusplus - #include "Bullet3Common/b3Quaternion.h" - #include "Bullet3Common/b3Transform.h" +#include "Bullet3Common/b3Quaternion.h" +#include "Bullet3Common/b3Transform.h" - #define b3Quat b3Quaternion - #define b3QuatConstArg const b3Quaternion& - inline b3Quat b3QuatInverse(b3QuatConstArg orn) - { - return orn.inverse(); - } +#define b3Quat b3Quaternion +#define b3QuatConstArg const b3Quaternion& +inline b3Quat b3QuatInverse(b3QuatConstArg orn) +{ + return orn.inverse(); +} - inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation) - { - b3Transform tr; - tr.setOrigin(translation); - tr.setRotation(orientation); - return tr(point); - } +inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation) +{ + b3Transform tr; + tr.setOrigin(translation); + tr.setRotation(orientation); + return tr(point); +} #else - typedef float4 b3Quat; - #define b3QuatConstArg const b3Quat - - +typedef float4 b3Quat; +#define b3QuatConstArg const b3Quat + inline float4 b3FastNormalize4(float4 v) { - v = (float4)(v.xyz,0.f); + v = (float4)(v.xyz, 0.f); return fast_normalize(v); } - + inline b3Quat b3QuatMul(b3Quat a, b3Quat b); inline b3Quat b3QuatNormalized(b3QuatConstArg in); inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec); @@ -43,20 +42,20 @@ inline b3Quat b3QuatInverse(b3QuatConstArg q); inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b) { b3Quat ans; - ans = b3Cross3( a, b ); - ans += a.w*b+b.w*a; -// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); - ans.w = a.w*b.w - b3Dot3F4(a, b); + ans = b3Cross3(a, b); + ans += a.w * b + b.w * a; + // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w * b.w - b3Dot3F4(a, b); return ans; } inline b3Quat b3QuatNormalized(b3QuatConstArg in) { b3Quat q; - q=in; + q = in; //return b3FastNormalize4(in); float len = native_sqrt(dot(q, q)); - if(len > 0.f) + if (len > 0.f) { q *= 1.f / len; } @@ -69,15 +68,13 @@ inline b3Quat b3QuatNormalized(b3QuatConstArg in) } inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec) { - b3Quat qInv = b3QuatInvert( q ); + b3Quat qInv = b3QuatInvert(q); float4 vcpy = vec; vcpy.w = 0.f; - float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv); + float4 out = b3QuatMul(b3QuatMul(q, vcpy), qInv); return out; } - - inline b3Quat b3QuatInverse(b3QuatConstArg q) { return (b3Quat)(-q.xyz, q.w); @@ -90,14 +87,14 @@ inline b3Quat b3QuatInvert(b3QuatConstArg q) inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec) { - return b3QuatRotate( b3QuatInvert( q ), vec ); + return b3QuatRotate(b3QuatInvert(q), vec); } -inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation) +inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation) { - return b3QuatRotate( orientation, point ) + (translation); + return b3QuatRotate(orientation, point) + (translation); } - -#endif -#endif //B3_QUAT_H +#endif + +#endif //B3_QUAT_H |