27 files changed, 4765 insertions, 4904 deletions
diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp
index b98e2b4d33..d546d5e066 100644
--- a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp
+++ b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.cpp
@@ -15,9 +15,11 @@ subject to the following restrictions:
 
 #include "b3AlignedAllocator.h"
 
+#if defined(B3_ALLOCATOR_STATISTICS) || defined(B3_DEBUG_MEMORY_ALLOCATIONS)  // the debug allocator's log lines read these counters even when statistics are off
 int b3g_numAlignedAllocs = 0;
 int b3g_numAlignedFree = 0;
-int b3g_totalBytesAlignedAllocs = 0;//detect memory leaks
+int b3g_totalBytesAlignedAllocs = 0;  //detect memory leaks
+#endif
 
 static void *b3AllocDefault(size_t size)
 {
@@ -29,12 +31,10 @@ static void b3FreeDefault(void *ptr)
 	free(ptr);
 }
 
-static b3AllocFunc* b3s_allocFunc = b3AllocDefault;
-static b3FreeFunc* b3s_freeFunc = b3FreeDefault;
-
-
+static b3AllocFunc *b3s_allocFunc = b3AllocDefault;
+static b3FreeFunc *b3s_freeFunc = b3FreeDefault;
 
-#if defined (B3_HAS_ALIGNED_ALLOCATOR)
+#if defined(B3_HAS_ALIGNED_ALLOCATOR)
 #include <malloc.h>
 static void *b3AlignedAllocDefault(size_t size, int alignment)
 {
@@ -59,123 +59,128 @@ static inline void b3AlignedFreeDefault(void *ptr)
 }
 #else
 
-
-
-
-
 static inline void *b3AlignedAllocDefault(size_t size, int alignment)
 {
-  void *ret;
-  char *real;
-  real = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment-1));
-  if (real) {
-	ret = b3AlignPointer(real + sizeof(void *),alignment);
-    *((void **)(ret)-1) = (void *)(real);
-  } else {
-    ret = (void *)(real);
-  }
-  return (ret);
+	void *ret;
+	char *real;
+	real = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment - 1));
+	if (real)
+	{
+		ret = b3AlignPointer(real + sizeof(void *), alignment);
+		*((void **)(ret)-1) = (void *)(real);
+	}
+	else
+	{
+		ret = (void *)(real);
+	}
+	return (ret);
 }
 
 static inline void b3AlignedFreeDefault(void *ptr)
 {
-  void* real;
+	void *real;
 
-  if (ptr) {
-    real = *((void **)(ptr)-1);
-    b3s_freeFunc(real);
-  }
+	if (ptr)
+	{
+		real = *((void **)(ptr)-1);
+		b3s_freeFunc(real);
+	}
 }
 #endif
 
-
-static b3AlignedAllocFunc* b3s_alignedAllocFunc = b3AlignedAllocDefault;
-static b3AlignedFreeFunc* b3s_alignedFreeFunc = b3AlignedFreeDefault;
+static b3AlignedAllocFunc *b3s_alignedAllocFunc = b3AlignedAllocDefault;
+static b3AlignedFreeFunc *b3s_alignedFreeFunc = b3AlignedFreeDefault;
 
 void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc)
 {
-  b3s_alignedAllocFunc = allocFunc ? allocFunc : b3AlignedAllocDefault;
-  b3s_alignedFreeFunc = freeFunc ? freeFunc : b3AlignedFreeDefault;
+	b3s_alignedAllocFunc = allocFunc ? allocFunc : b3AlignedAllocDefault;
+	b3s_alignedFreeFunc = freeFunc ? freeFunc : b3AlignedFreeDefault;
 }
 
 void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc)
 {
-  b3s_allocFunc = allocFunc ? allocFunc : b3AllocDefault;
-  b3s_freeFunc = freeFunc ? freeFunc : b3FreeDefault;
+	b3s_allocFunc = allocFunc ? allocFunc : b3AllocDefault;
+	b3s_freeFunc = freeFunc ? freeFunc : b3FreeDefault;
 }
 
 #ifdef B3_DEBUG_MEMORY_ALLOCATIONS
 //this generic allocator provides the total allocated number of bytes
 #include <stdio.h>
 
-void*   b3AlignedAllocInternal  (size_t size, int alignment,int line,char* filename)
+void *b3AlignedAllocInternal(size_t size, int alignment, int line, char *filename)
 {
- void *ret;
- char *real;
-
- b3g_totalBytesAlignedAllocs += size;
- b3g_numAlignedAllocs++;
-
- 
- real = (char *)b3s_allocFunc(size + 2*sizeof(void *) + (alignment-1));
- if (real) {
-   ret = (void*) b3AlignPointer(real + 2*sizeof(void *), alignment);
-   *((void **)(ret)-1) = (void *)(real);
-       *((int*)(ret)-2) = size;
-
- } else {
-   ret = (void *)(real);//??
- }
+	void *ret;
+	char *real;
+#ifdef B3_ALLOCATOR_STATISTICS
+	b3g_totalBytesAlignedAllocs += size;
+	b3g_numAlignedAllocs++;
+#endif
+	real = (char *)b3s_allocFunc(size + 2 * sizeof(void *) + (alignment - 1));  // payload + two pointer-sized header slots + alignment slack
+	if (real)
+	{
+		ret = (void *)b3AlignPointer(real + 2 * sizeof(void *), alignment);
+		*((void **)(ret)-1) = (void *)(real);    // header slot -1: raw block, later handed to b3s_freeFunc
+		*((size_t *)((void **)(ret)-2)) = size;  // header slot -2: request size, kept in its own pointer-sized slot so it cannot clobber slot -1 when pointers are 8 bytes
+	}
+	else
+	{
+		ret = (void *)(real);  // allocation failed: propagate NULL to the caller
+	}
 
- b3Printf("allocation#%d at address %x, from %s,line %d, size %d\n",b3g_numAlignedAllocs,real, filename,line,size);
+	b3Printf("allocation#%d at address %p, from %s,line %d, size %d\n", b3g_numAlignedAllocs, (void *)real, filename, line, (int)size);
 
- int* ptr = (int*)ret;
- *ptr = 12;
- return (ret);
+	// Keep the existing write of 12 into the block's first int, but only for a non-NULL result (a failed allocation used to be dereferenced here).
+	if (ret) *(int *)ret = 12;
+	return (ret);
 }
 
-void    b3AlignedFreeInternal   (void* ptr,int line,char* filename)
+void b3AlignedFreeInternal(void *ptr, int line, char *filename)
 {
+	void *real;
+#ifdef B3_ALLOCATOR_STATISTICS
+	b3g_numAlignedFree++;
+#endif
+	if (ptr)
+	{
+		real = *((void **)(ptr)-1);                       // header slot -1: raw block returned by b3s_allocFunc
+		int size = (int)*((size_t *)((void **)(ptr)-2));  // header slot -2: size recorded by b3AlignedAllocInternal
+#ifdef B3_ALLOCATOR_STATISTICS
+		b3g_totalBytesAlignedAllocs -= size;
+#endif
+		b3Printf("free #%d at address %p, from %s,line %d, size %d\n", b3g_numAlignedFree, real, filename, line, size);
 
- void* real;
- b3g_numAlignedFree++;
-
- if (ptr) {
-   real = *((void **)(ptr)-1);
-       int size = *((int*)(ptr)-2);
-       b3g_totalBytesAlignedAllocs -= size;
-
-	   b3Printf("free #%d at address %x, from %s,line %d, size %d\n",b3g_numAlignedFree,real, filename,line,size);
-
-   b3s_freeFunc(real);
- } else
- {
-	 b3Printf("NULL ptr\n");
- }
+		b3s_freeFunc(real);
+	}
+	else
+	{
+		b3Printf("NULL ptr\n");
+	}
 }
 
-#else //B3_DEBUG_MEMORY_ALLOCATIONS
+#else  //B3_DEBUG_MEMORY_ALLOCATIONS
 
-void*	b3AlignedAllocInternal	(size_t size, int alignment)
+void *b3AlignedAllocInternal(size_t size, int alignment)
 {
+#ifdef B3_ALLOCATOR_STATISTICS
 	b3g_numAlignedAllocs++;
-	void* ptr;
+#endif
+	void *ptr;
 	ptr = b3s_alignedAllocFunc(size, alignment);
-//	b3Printf("b3AlignedAllocInternal %d, %x\n",size,ptr);
+	//	b3Printf("b3AlignedAllocInternal %d, %x\n",size,ptr);
 	return ptr;
 }
 
-void	b3AlignedFreeInternal	(void* ptr)
+void b3AlignedFreeInternal(void *ptr)
 {
 	if (!ptr)
 	{
 		return;
 	}
-
+#ifdef B3_ALLOCATOR_STATISTICS
 	b3g_numAlignedFree++;
-//	b3Printf("b3AlignedFreeInternal %x\n",ptr);
+#endif
+	//	b3Printf("b3AlignedFreeInternal %x\n",ptr);
 	b3s_alignedFreeFunc(ptr);
 }
 
-#endif //B3_DEBUG_MEMORY_ALLOCATIONS
-
+#endif  //B3_DEBUG_MEMORY_ALLOCATIONS
diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h
index be418bd55f..bcff9f128e 100644
--- a/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h
+++ b/thirdparty/bullet/Bullet3Common/b3AlignedAllocator.h
@@ -24,84 +24,87 @@ subject to the following restrictions:
 //#define B3_DEBUG_MEMORY_ALLOCATIONS 1
 #ifdef B3_DEBUG_MEMORY_ALLOCATIONS
 
-#define b3AlignedAlloc(a,b) \
-		b3AlignedAllocInternal(a,b,__LINE__,__FILE__)
+#define b3AlignedAlloc(a, b) \
+	b3AlignedAllocInternal(a, b, __LINE__, __FILE__)
 
 #define b3AlignedFree(ptr) \
-		b3AlignedFreeInternal(ptr,__LINE__,__FILE__)
+	b3AlignedFreeInternal(ptr, __LINE__, __FILE__)
 
-void*	b3AlignedAllocInternal	(size_t size, int alignment,int line,char* filename);
+void* b3AlignedAllocInternal(size_t size, int alignment, int line, char* filename);
 
-void	b3AlignedFreeInternal	(void* ptr,int line,char* filename);
+void b3AlignedFreeInternal(void* ptr, int line, char* filename);
 
 #else
-	void*	b3AlignedAllocInternal	(size_t size, int alignment);
-	void	b3AlignedFreeInternal	(void* ptr);
+void* b3AlignedAllocInternal(size_t size, int alignment);
+void b3AlignedFreeInternal(void* ptr);
 
-	#define b3AlignedAlloc(size,alignment) b3AlignedAllocInternal(size,alignment)
-	#define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr)
+#define b3AlignedAlloc(size, alignment) b3AlignedAllocInternal(size, alignment)
+#define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr)
 
 #endif
-typedef int	btSizeType;
+typedef int btSizeType;
 
-typedef void *(b3AlignedAllocFunc)(size_t size, int alignment);
-typedef void (b3AlignedFreeFunc)(void *memblock);
-typedef void *(b3AllocFunc)(size_t size);
-typedef void (b3FreeFunc)(void *memblock);
+typedef void*(b3AlignedAllocFunc)(size_t size, int alignment);
+typedef void(b3AlignedFreeFunc)(void* memblock);
+typedef void*(b3AllocFunc)(size_t size);
+typedef void(b3FreeFunc)(void* memblock);
 
 ///The developer can let all Bullet memory allocations go through a custom memory allocator, using b3AlignedAllocSetCustom
-void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc);
+void b3AlignedAllocSetCustom(b3AllocFunc* allocFunc, b3FreeFunc* freeFunc);
 ///If the developer has already an custom aligned allocator, then b3AlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
-void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc);
-
+void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc* allocFunc, b3AlignedFreeFunc* freeFunc);
 
 ///The b3AlignedAllocator is a portable class for aligned memory allocations.
 ///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using b3AlignedAllocSetCustom and b3AlignedAllocSetCustomAligned.
-template < typename T , unsigned Alignment >
-class b3AlignedAllocator {
-	
-	typedef b3AlignedAllocator< T , Alignment > self_type;
-	
-public:
+template <typename T, unsigned Alignment>
+class b3AlignedAllocator
+{
+	typedef b3AlignedAllocator<T, Alignment> self_type;
 
+public:
 	//just going down a list:
 	b3AlignedAllocator() {}
 	/*
 	b3AlignedAllocator( const self_type & ) {}
 	*/
 
-	template < typename Other >
-	b3AlignedAllocator( const b3AlignedAllocator< Other , Alignment > & ) {}
+	template <typename Other>
+	b3AlignedAllocator(const b3AlignedAllocator<Other, Alignment>&)
+	{
+	}
 
-	typedef const T*         const_pointer;
-	typedef const T&         const_reference;
-	typedef T*               pointer;
-	typedef T&               reference;
-	typedef T                value_type;
+	typedef const T* const_pointer;
+	typedef const T& const_reference;
+	typedef T* pointer;
+	typedef T& reference;
+	typedef T value_type;
 
-	pointer       address   ( reference        ref ) const                           { return &ref; }
-	const_pointer address   ( const_reference  ref ) const                           { return &ref; }
-	pointer       allocate  ( btSizeType        n   , const_pointer *      hint = 0 ) {
+	pointer address(reference ref) const { return &ref; }
+	const_pointer address(const_reference ref) const { return &ref; }
+	pointer allocate(btSizeType n, const_pointer* hint = 0)
+	{
 		(void)hint;
-		return reinterpret_cast< pointer >(b3AlignedAlloc( sizeof(value_type) * n , Alignment ));
+		return reinterpret_cast<pointer>(b3AlignedAlloc(sizeof(value_type) * n, Alignment));
 	}
-	void          construct ( pointer          ptr , const value_type &   value    ) { new (ptr) value_type( value ); }
-	void          deallocate( pointer          ptr ) {
-		b3AlignedFree( reinterpret_cast< void * >( ptr ) );
+	void construct(pointer ptr, const value_type& value) { new (ptr) value_type(value); }
+	void deallocate(pointer ptr)
+	{
+		b3AlignedFree(reinterpret_cast<void*>(ptr));
 	}
-	void          destroy   ( pointer          ptr )                                 { ptr->~value_type(); }
-	
+	void destroy(pointer ptr) { ptr->~value_type(); }
 
-	template < typename O > struct rebind {
-		typedef b3AlignedAllocator< O , Alignment > other;
+	template <typename O>
+	struct rebind
+	{
+		typedef b3AlignedAllocator<O, Alignment> other;
 	};
-	template < typename O >
-	self_type & operator=( const b3AlignedAllocator< O , Alignment > & ) { return *this; }
+	template <typename O>
+	self_type& operator=(const b3AlignedAllocator<O, Alignment>&)
+	{
+		return *this;
+	}
 
-	friend bool operator==( const self_type & , const self_type & ) { return true; }
+	friend bool operator==(const self_type&, const self_type&) { return true; }
 };
 
-
-
-#endif //B3_ALIGNED_ALLOCATOR
-
+#endif  //B3_ALIGNED_ALLOCATOR
diff --git a/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h b/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h
index ef71016565..249e381bf1 100644
--- a/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h
+++ b/thirdparty/bullet/Bullet3Common/b3AlignedObjectArray.h
@@ -13,11 +13,10 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef B3_OBJECT_ARRAY__
 #define B3_OBJECT_ARRAY__
 
-#include "b3Scalar.h" // has definitions like B3_FORCE_INLINE
+#include "b3Scalar.h"  // has definitions like B3_FORCE_INLINE
 #include "b3AlignedAllocator.h"
 
 ///If the platform doesn't support placement new, you can disable B3_USE_PLACEMENT_NEW
@@ -28,402 +27,386 @@ subject to the following restrictions:
 
 #define B3_USE_PLACEMENT_NEW 1
 //#define B3_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
-#define B3_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
+#define B3_ALLOW_ARRAY_COPY_OPERATOR  // enabling this can accidentally perform deep copies of data if you are not careful
 
 #ifdef B3_USE_MEMCPY
 #include <memory.h>
 #include <string.h>
-#endif //B3_USE_MEMCPY
+#endif  //B3_USE_MEMCPY
 
 #ifdef B3_USE_PLACEMENT_NEW
-#include <new> //for placement new
-#endif //B3_USE_PLACEMENT_NEW
-
+#include <new>  //for placement new
+#endif          //B3_USE_PLACEMENT_NEW
 
 ///The b3AlignedObjectArray template class uses a subset of the stl::vector interface for its methods
 ///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
-template <typename T> 
-//template <class T> 
+template <typename T>
+//template <class T>
 class b3AlignedObjectArray
 {
-	b3AlignedAllocator<T , 16>	m_allocator;
+	b3AlignedAllocator<T, 16> m_allocator;
 
-	int					m_size;
-	int					m_capacity;
-	T*					m_data;
+	int m_size;
+	int m_capacity;
+	T* m_data;
 	//PCK: added this line
-	bool				m_ownsMemory;
+	bool m_ownsMemory;
 
 #ifdef B3_ALLOW_ARRAY_COPY_OPERATOR
 public:
-	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other)
+	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other)
 	{
 		copyFromArray(other);
 		return *this;
 	}
-#else//B3_ALLOW_ARRAY_COPY_OPERATOR
+#else   //B3_ALLOW_ARRAY_COPY_OPERATOR
 private:
-		B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T> &other);
-#endif//B3_ALLOW_ARRAY_COPY_OPERATOR
+	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other);
+#endif  //B3_ALLOW_ARRAY_COPY_OPERATOR
 
 protected:
-		B3_FORCE_INLINE	int	allocSize(int size)
-		{
-			return (size ? size*2 : 1);
-		}
-		B3_FORCE_INLINE	void	copy(int start,int end, T* dest) const
-		{
-			int i;
-			for (i=start;i<end;++i)
+	B3_FORCE_INLINE int allocSize(int size)
+	{
+		return (size ? size * 2 : 1);
+	}
+	B3_FORCE_INLINE void copy(int start, int end, T* dest) const
+	{
+		int i;
+		for (i = start; i < end; ++i)
 #ifdef B3_USE_PLACEMENT_NEW
-				new (&dest[i]) T(m_data[i]);
+			new (&dest[i]) T(m_data[i]);
 #else
-				dest[i] = m_data[i];
-#endif //B3_USE_PLACEMENT_NEW
-		}
+			dest[i] = m_data[i];
+#endif  //B3_USE_PLACEMENT_NEW
+	}
 
-		B3_FORCE_INLINE	void	init()
+	B3_FORCE_INLINE void init()
+	{
+		//PCK: added this line
+		m_ownsMemory = true;
+		m_data = 0;
+		m_size = 0;
+		m_capacity = 0;
+	}
+	B3_FORCE_INLINE void destroy(int first, int last)
+	{
+		int i;
+		for (i = first; i < last; i++)
 		{
-			//PCK: added this line
-			m_ownsMemory = true;
-			m_data = 0;
-			m_size = 0;
-			m_capacity = 0;
+			m_data[i].~T();
 		}
-		B3_FORCE_INLINE	void	destroy(int first,int last)
+	}
+
+	B3_FORCE_INLINE void* allocate(int size)
+	{
+		if (size)
+			return m_allocator.allocate(size);
+		return 0;
+	}
+
+	B3_FORCE_INLINE void deallocate()
+	{
+		if (m_data)
 		{
-			int i;
-			for (i=first; i<last;i++)
+			//PCK: enclosed the deallocation in this block
+			if (m_ownsMemory)
 			{
-				m_data[i].~T();
+				m_allocator.deallocate(m_data);
 			}
+			m_data = 0;
 		}
+	}
 
-		B3_FORCE_INLINE	void* allocate(int size)
-		{
-			if (size)
-				return m_allocator.allocate(size);
-			return 0;
-		}
+public:
+	b3AlignedObjectArray()
+	{
+		init();
+	}
 
-		B3_FORCE_INLINE	void	deallocate()
-		{
-			if(m_data)	{
-				//PCK: enclosed the deallocation in this block
-				if (m_ownsMemory)
-				{
-					m_allocator.deallocate(m_data);
-				}
-				m_data = 0;
-			}
-		}
+	~b3AlignedObjectArray()
+	{
+		clear();
+	}
 
-	
+	///Generally it is best to avoid using the copy constructor of a b3AlignedObjectArray, and use a (const) reference to the array instead.
+	b3AlignedObjectArray(const b3AlignedObjectArray& otherArray)
+	{
+		init();
+		int otherSize = otherArray.size();
+		reserve(otherSize);  // raw storage only: resize() would fill-construct elements that copy() then constructs over without destroying them
+		otherArray.copy(0, otherSize, m_data);
+		m_size = otherSize;
+	}
 
-	public:
-		
-		b3AlignedObjectArray()
-		{
-			init();
-		}
+	/// return the number of elements in the array
+	B3_FORCE_INLINE int size() const
+	{
+		return m_size;
+	}
 
-		~b3AlignedObjectArray()
-		{
-			clear();
-		}
+	B3_FORCE_INLINE const T& at(int n) const
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
 
-		///Generally it is best to avoid using the copy constructor of an b3AlignedObjectArray, and use a (const) reference to the array instead.
-		b3AlignedObjectArray(const b3AlignedObjectArray& otherArray)
-		{
-			init();
+	B3_FORCE_INLINE T& at(int n)
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
 
-			int otherSize = otherArray.size();
-			resize (otherSize);
-			otherArray.copy(0, otherSize, m_data);
-		}
+	B3_FORCE_INLINE const T& operator[](int n) const
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
 
-		
-		
-		/// return the number of elements in the array
-		B3_FORCE_INLINE	int size() const
-		{	
-			return m_size;
-		}
-		
-		B3_FORCE_INLINE const T& at(int n) const
-		{
-			b3Assert(n>=0);
-			b3Assert(n<size());
-			return m_data[n];
-		}
+	B3_FORCE_INLINE T& operator[](int n)
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
 
-		B3_FORCE_INLINE T& at(int n)
-		{
-			b3Assert(n>=0);
-			b3Assert(n<size());
-			return m_data[n];
-		}
+	///clear the array and deallocate its memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
+	B3_FORCE_INLINE void clear()
+	{
+		destroy(0, size());
 
-		B3_FORCE_INLINE const T& operator[](int n) const
-		{
-			b3Assert(n>=0);
-			b3Assert(n<size());
-			return m_data[n];
-		}
+		deallocate();
 
-		B3_FORCE_INLINE T& operator[](int n)
-		{
-			b3Assert(n>=0);
-			b3Assert(n<size());
-			return m_data[n];
-		}
-		
+		init();
+	}
 
-		///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
-		B3_FORCE_INLINE	void	clear()
+	B3_FORCE_INLINE void pop_back()
+	{
+		b3Assert(m_size > 0);
+		m_size--;
+		m_data[m_size].~T();
+	}
+
+	///resizeNoInitialize changes the number of elements in the array without constructing new elements or destroying removed ones (resize, below, does both, using its optional second argument as the fill value).
+	///when the new number of elements is smaller, memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
+	B3_FORCE_INLINE void resizeNoInitialize(int newsize)
+	{
+		int curSize = size();
+
+		if (newsize < curSize)
 		{
-			destroy(0,size());
-			
-			deallocate();
-			
-			init();
 		}
-
-		B3_FORCE_INLINE	void	pop_back()
+		else
 		{
-			b3Assert(m_size>0);
-			m_size--;
-			m_data[m_size].~T();
+			if (newsize > size())
+			{
+				reserve(newsize);
+			}
+			//leave this uninitialized
 		}
+		m_size = newsize;
+	}
 
+	B3_FORCE_INLINE void resize(int newsize, const T& fillData = T())
+	{
+		int curSize = size();
 
-		///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
-		///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
-		B3_FORCE_INLINE	void	resizeNoInitialize(int newsize)
+		if (newsize < curSize)
 		{
-			int curSize = size();
-
-			if (newsize < curSize)
-			{
-			} else
+			for (int i = newsize; i < curSize; i++)
 			{
-				if (newsize > size())
-				{
-					reserve(newsize);
-				}
-				//leave this uninitialized
+				m_data[i].~T();
 			}
-			m_size = newsize;
 		}
-	
-		B3_FORCE_INLINE	void	resize(int newsize, const T& fillData=T())
+		else
 		{
-			int curSize = size();
-
-			if (newsize < curSize)
-			{
-				for(int i = newsize; i < curSize; i++)
-				{
-					m_data[i].~T();
-				}
-			} else
+			if (newsize > size())
 			{
-				if (newsize > size())
-				{
-					reserve(newsize);
-				}
-#ifdef B3_USE_PLACEMENT_NEW
-				for (int i=curSize;i<newsize;i++)
-				{
-					new ( &m_data[i]) T(fillData);
-				}
-#endif //B3_USE_PLACEMENT_NEW
-
+				reserve(newsize);
 			}
-
-			m_size = newsize;
-		}
-		B3_FORCE_INLINE	T&  expandNonInitializing( )
-		{	
-			int sz = size();
-			if( sz == capacity() )
+#ifdef B3_USE_PLACEMENT_NEW
+			for (int i = curSize; i < newsize; i++)
 			{
-				reserve( allocSize(size()) );
+				new (&m_data[i]) T(fillData);
 			}
-			m_size++;
+#endif  //B3_USE_PLACEMENT_NEW
+		}
 
-			return m_data[sz];		
+		m_size = newsize;
+	}
+	B3_FORCE_INLINE T& expandNonInitializing()
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
 		}
+		m_size++;
 
+		return m_data[sz];
+	}
 
-		B3_FORCE_INLINE	T&  expand( const T& fillValue=T())
-		{	
-			int sz = size();
-			if( sz == capacity() )
-			{
-				reserve( allocSize(size()) );
-			}
-			m_size++;
+	B3_FORCE_INLINE T& expand(const T& fillValue = T())
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
+		}
+		m_size++;
 #ifdef B3_USE_PLACEMENT_NEW
-			new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
+		new (&m_data[sz]) T(fillValue);  //use the in-place new (not really allocating heap memory)
 #endif
 
-			return m_data[sz];		
-		}
+		return m_data[sz];
+	}
 
+	B3_FORCE_INLINE void push_back(const T& _Val)
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
+		}
 
-		B3_FORCE_INLINE	void push_back(const T& _Val)
-		{	
-			int sz = size();
-			if( sz == capacity() )
-			{
-				reserve( allocSize(size()) );
-			}
-			
 #ifdef B3_USE_PLACEMENT_NEW
-			new ( &m_data[m_size] ) T(_Val);
+		new (&m_data[m_size]) T(_Val);
 #else
-			m_data[size()] = _Val;			
-#endif //B3_USE_PLACEMENT_NEW
+		m_data[size()] = _Val;
+#endif  //B3_USE_PLACEMENT_NEW
 
-			m_size++;
-		}
+		m_size++;
+	}
 
-	
-		/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
-		B3_FORCE_INLINE	int capacity() const
-		{	
-			return m_capacity;
-		}
-		
-		B3_FORCE_INLINE	void reserve(int _Count)
-		{	// determine new minimum length of allocated storage
-			if (capacity() < _Count)
-			{	// not enough room, reallocate
-				T*	s = (T*)allocate(_Count);
-				b3Assert(s);
-				if (s==0)
-				{
-					b3Error("b3AlignedObjectArray reserve out-of-memory\n");
-					_Count=0;
-					m_size=0;
-				}
-				copy(0, size(), s);
-
-				destroy(0,size());
-
-				deallocate();
-				
-				//PCK: added this line
-				m_ownsMemory = true;
-
-				m_data = s;
-				
-				m_capacity = _Count;
+	/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
+	B3_FORCE_INLINE int capacity() const
+	{
+		return m_capacity;
+	}
 
+	B3_FORCE_INLINE void reserve(int _Count)
+	{  // determine new minimum length of allocated storage
+		if (capacity() < _Count)
+		{  // not enough room, reallocate
+			T* s = (T*)allocate(_Count);
+			b3Assert(s);
+			if (s == 0)
+			{
+				b3Error("b3AlignedObjectArray reserve out-of-memory\n");
+				_Count = 0;
+				m_size = 0;
 			}
-		}
+			copy(0, size(), s);
 
+			destroy(0, size());
 
-		class less
-		{
-			public:
+			deallocate();
+
+			//PCK: added this line
+			m_ownsMemory = true;
+
+			m_data = s;
 
-				bool operator() ( const T& a, const T& b )
-				{
-					return ( a < b );
-				}
-		};
-	
+			m_capacity = _Count;
+		}
+	}
 
-		template <typename L>
-		void quickSortInternal(const L& CompareFunc,int lo, int hi)
+	class less
+	{
+	public:
+		bool operator()(const T& a, const T& b)
 		{
-		//  lo is the lower index, hi is the upper index
-		//  of the region of array a that is to be sorted
-			int i=lo, j=hi;
-			T x=m_data[(lo+hi)/2];
-
-			//  partition
-			do
-			{    
-				while (CompareFunc(m_data[i],x)) 
-					i++; 
-				while (CompareFunc(x,m_data[j])) 
-					j--;
-				if (i<=j)
-				{
-					swap(i,j);
-					i++; j--;
-				}
-			} while (i<=j);
-
-			//  recursion
-			if (lo<j) 
-				quickSortInternal( CompareFunc, lo, j);
-			if (i<hi) 
-				quickSortInternal( CompareFunc, i, hi);
+			return (a < b);
 		}
+	};
 
+	template <typename L>
+	void quickSortInternal(const L& CompareFunc, int lo, int hi)
+	{
+		//  lo is the lower index, hi is the upper index
+		//  of the region of array a that is to be sorted
+		int i = lo, j = hi;
+		T x = m_data[(lo + hi) / 2];
 
-		template <typename L>
-		void quickSort(const L& CompareFunc)
+		//  partition
+		do
 		{
-			//don't sort 0 or 1 elements
-			if (size()>1)
+			while (CompareFunc(m_data[i], x))
+				i++;
+			while (CompareFunc(x, m_data[j]))
+				j--;
+			if (i <= j)
 			{
-				quickSortInternal(CompareFunc,0,size()-1);
+				swap(i, j);
+				i++;
+				j--;
 			}
+		} while (i <= j);
+
+		//  recursion
+		if (lo < j)
+			quickSortInternal(CompareFunc, lo, j);
+		if (i < hi)
+			quickSortInternal(CompareFunc, i, hi);
+	}
+
+	template <typename L>
+	void quickSort(const L& CompareFunc)
+	{
+		//don't sort 0 or 1 elements
+		if (size() > 1)
+		{
+			quickSortInternal(CompareFunc, 0, size() - 1);
 		}
+	}
 
+	///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
+	template <typename L>
+	void downHeap(T* pArr, int k, int n, const L& CompareFunc)
+	{
+		/*  PRE: a[k+1..N] is a heap */
+		/* POST:  a[k..N]  is a heap */
 
-		///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
-		template <typename L>
-		void downHeap(T *pArr, int k, int n, const L& CompareFunc)
+		T temp = pArr[k - 1];
+		/* k has child(s) */
+		while (k <= n / 2)
 		{
-			/*  PRE: a[k+1..N] is a heap */
-			/* POST:  a[k..N]  is a heap */
-			
-			T temp = pArr[k - 1];
-			/* k has child(s) */
-			while (k <= n/2) 
+			int child = 2 * k;
+
+			if ((child < n) && CompareFunc(pArr[child - 1], pArr[child]))
+			{
+				child++;
+			}
+			/* pick larger child */
+			if (CompareFunc(temp, pArr[child - 1]))
 			{
-				int child = 2*k;
-				
-				if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
-				{
-					child++;
-				}
-				/* pick larger child */
-				if (CompareFunc(temp , pArr[child - 1]))
-				{
-					/* move child up */
-					pArr[k - 1] = pArr[child - 1];
-					k = child;
-				}
-				else
-				{
-					break;
-				}
+				/* move child up */
+				pArr[k - 1] = pArr[child - 1];
+				k = child;
 			}
-			pArr[k - 1] = temp;
-		} /*downHeap*/
+			else
+			{
+				break;
+			}
+		}
+		pArr[k - 1] = temp;
+	} /*downHeap*/
 
-		void	swap(int index0,int index1)
-		{
+	void swap(int index0, int index1)
+	{
 #ifdef B3_USE_MEMCPY
-			char	temp[sizeof(T)];
-			memcpy(temp,&m_data[index0],sizeof(T));
-			memcpy(&m_data[index0],&m_data[index1],sizeof(T));
-			memcpy(&m_data[index1],temp,sizeof(T));
+		char temp[sizeof(T)];
+		memcpy(temp, &m_data[index0], sizeof(T));
+		memcpy(&m_data[index0], &m_data[index1], sizeof(T));
+		memcpy(&m_data[index1], temp, sizeof(T));
 #else
-			T temp = m_data[index0];
-			m_data[index0] = m_data[index1];
-			m_data[index1] = temp;
-#endif //B3_USE_PLACEMENT_NEW
-
-		}
+		T temp = m_data[index0];
+		m_data[index0] = m_data[index1];
+		m_data[index1] = temp;
+#endif  //B3_USE_PLACEMENT_NEW
+	}
 
 	template <typename L>
 	void heapSort(const L& CompareFunc)
@@ -431,49 +414,48 @@ protected:
 		/* sort a[0..N-1],  N.B. 0 to N-1 */
 		int k;
 		int n = m_size;
-		for (k = n/2; k > 0; k--) 
+		for (k = n / 2; k > 0; k--)
 		{
 			downHeap(m_data, k, n, CompareFunc);
 		}
 
 		/* a[1..N] is now a heap */
-		while ( n>=1 ) 
+		while (n >= 1)
 		{
-			swap(0,n-1); /* largest of a[0..n-1] */
-
+			swap(0, n - 1); /* largest of a[0..n-1] */
 
 			n = n - 1;
 			/* restore a[1..i-1] heap */
 			downHeap(m_data, 1, n, CompareFunc);
-		} 
+		}
 	}
 
 	///non-recursive binary search, assumes sorted array
-	int	findBinarySearch(const T& key) const
+	int findBinarySearch(const T& key) const
 	{
 		int first = 0;
-		int last = size()-1;
+		int last = size() - 1;
 
 		//assume sorted array
-		while (first <= last) {
+		while (first <= last)
+		{
 			int mid = (first + last) / 2;  // compute mid point.
-			if (key > m_data[mid]) 
+			if (key > m_data[mid])
 				first = mid + 1;  // repeat search in top half.
-			else if (key < m_data[mid]) 
-				last = mid - 1; // repeat search in bottom half.
+			else if (key < m_data[mid])
+				last = mid - 1;  // repeat search in bottom half.
 			else
-				return mid;     // found it. return position /////
+				return mid;  // found it. return position /////
 		}
-		return size();    // failed to find key
+		return size();  // failed to find key
 	}
 
-
-	int	findLinearSearch(const T& key) const
+	int findLinearSearch(const T& key) const
 	{
-		int index=size();
+		int index = size();
 		int i;
 
-		for (i=0;i<size();i++)
+		for (i = 0; i < size(); i++)
 		{
 			if (m_data[i] == key)
 			{
@@ -483,36 +465,35 @@ protected:
 		}
 		return index;
 	}
-    
-    int	findLinearSearch2(const T& key) const
-    {
-        int index=-1;
-        int i;
-        
-        for (i=0;i<size();i++)
-        {
-            if (m_data[i] == key)
-            {
-                index = i;
-                break;
-            }
-        }
-        return index;
-    }
-
-	void	remove(const T& key)
+
+	int findLinearSearch2(const T& key) const
 	{
+		int index = -1;
+		int i;
 
+		for (i = 0; i < size(); i++)
+		{
+			if (m_data[i] == key)
+			{
+				index = i;
+				break;
+			}
+		}
+		return index;
+	}
+
+	void remove(const T& key)
+	{
 		int findIndex = findLinearSearch(key);
-		if (findIndex<size())
+		if (findIndex < size())
 		{
-			swap( findIndex,size()-1);
+			swap(findIndex, size() - 1);
 			pop_back();
 		}
 	}
 
 	//PCK: whole function
-	void initializeFromBuffer(void *buffer, int size, int capacity)
+	void initializeFromBuffer(void* buffer, int size, int capacity)
 	{
 		clear();
 		m_ownsMemory = false;
@@ -524,18 +505,18 @@ protected:
 	void copyFromArray(const b3AlignedObjectArray& otherArray)
 	{
 		int otherSize = otherArray.size();
-		resize (otherSize);
+		resize(otherSize);
 		otherArray.copy(0, otherSize, m_data);
 	}
 
 	void removeAtIndex(int index)
-    {
-        if (index<size())
-        {
-            swap( index,size()-1);
-            pop_back();
-        }
-    }
+	{
+		if (index < size())
+		{
+			swap(index, size() - 1);
+			pop_back();
+		}
+	}
 };
 
-#endif //B3_OBJECT_ARRAY__
+#endif  //B3_OBJECT_ARRAY__
diff --git a/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h b/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h
index 38df8e2600..5fe4f25f8d 100644
--- a/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h
+++ b/thirdparty/bullet/Bullet3Common/b3CommandLineArgs.h
@@ -12,51 +12,54 @@
 class b3CommandLineArgs
 {
 protected:
-
 	std::map<std::string, std::string> pairs;
 
 public:
-
 	// Constructor
 	b3CommandLineArgs(int argc, char **argv)
 	{
-		addArgs(argc,argv);
+		addArgs(argc, argv);
 	}
 
-	void addArgs(int argc, char**argv)
+	void addArgs(int argc, char **argv)
 	{
-	    for (int i = 1; i < argc; i++)
-	    {
-	        std::string arg = argv[i];
+		for (int i = 1; i < argc; i++)
+		{
+			std::string arg = argv[i];
+
+			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-'))
+			{
+				continue;
+			}
 
-			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) {
-	        	continue;
-	        }
+			std::string::size_type pos;
+			std::string key, val;
+			if ((pos = arg.find('=')) == std::string::npos)
+			{
+				key = std::string(arg, 2, arg.length() - 2);
+				val = "";
+			}
+			else
+			{
+				key = std::string(arg, 2, pos - 2);
+				val = std::string(arg, pos + 1, arg.length() - 1);
+			}
 
-        	std::string::size_type pos;
-		    std::string key, val;
-	        if ((pos = arg.find( '=')) == std::string::npos) {
-	        	key = std::string(arg, 2, arg.length() - 2);
-	        	val = "";
-	        } else {
-	        	key = std::string(arg, 2, pos - 2);
-	        	val = std::string(arg, pos + 1, arg.length() - 1);
-	        }
-			
 			//only add new keys, don't replace existing
-			if(pairs.find(key) == pairs.end())
+			if (pairs.find(key) == pairs.end())
 			{
-        		pairs[key] = val;
+				pairs[key] = val;
 			}
-	    }
+		}
 	}
 
-	bool CheckCmdLineFlag(const char* arg_name)
+	bool CheckCmdLineFlag(const char *arg_name)
 	{
 		std::map<std::string, std::string>::iterator itr;
-		if ((itr = pairs.find(arg_name)) != pairs.end()) {
+		if ((itr = pairs.find(arg_name)) != pairs.end())
+		{
 			return true;
-	    }
+		}
 		return false;
 	}
 
@@ -73,29 +76,31 @@ template <typename T>
 inline bool b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
 {
 	std::map<std::string, std::string>::iterator itr;
-	if ((itr = pairs.find(arg_name)) != pairs.end()) {
+	if ((itr = pairs.find(arg_name)) != pairs.end())
+	{
 		std::istringstream strstream(itr->second);
 		strstream >> val;
 		return true;
-    }
+	}
 	return false;
 }
 
 template <>
-inline bool b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+inline bool b3CommandLineArgs::GetCmdLineArgument<char *>(const char *arg_name, char *&val)
 {
 	std::map<std::string, std::string>::iterator itr;
-	if ((itr = pairs.find(arg_name)) != pairs.end()) {
-
+	if ((itr = pairs.find(arg_name)) != pairs.end())
+	{
 		std::string s = itr->second;
-		val = (char*) malloc(sizeof(char) * (s.length() + 1));
+		val = (char *)malloc(sizeof(char) * (s.length() + 1));
 		std::strcpy(val, s.c_str());
 		return true;
-	} else {
-    	val = NULL;
+	}
+	else
+	{
+		val = NULL;
 	}
 	return false;
 }
 
-
-#endif //COMMAND_LINE_ARGS_H
+#endif  //COMMAND_LINE_ARGS_H
diff --git a/thirdparty/bullet/Bullet3Common/b3FileUtils.h b/thirdparty/bullet/Bullet3Common/b3FileUtils.h
index b5e8225cf0..9ded17eaaf 100644
--- a/thirdparty/bullet/Bullet3Common/b3FileUtils.h
+++ b/thirdparty/bullet/Bullet3Common/b3FileUtils.h
@@ -3,7 +3,7 @@
 
 #include <stdio.h>
 #include "b3Scalar.h"
-#include <stddef.h>//ptrdiff_h
+#include <stddef.h>  //ptrdiff_h
 #include <string.h>
 
 struct b3FileUtils
@@ -17,42 +17,42 @@ struct b3FileUtils
 
 	static bool findFile(const char* orgFileName, char* relativeFileName, int maxRelativeFileNameMaxLen)
 	{
-		FILE* f=0;
-		f = fopen(orgFileName,"rb");
-                if (f)
-                {
+		FILE* f = 0;
+		f = fopen(orgFileName, "rb");
+		if (f)
+		{
 			//printf("original file found: [%s]\n", orgFileName);
-			sprintf(relativeFileName,"%s", orgFileName);
+			snprintf(relativeFileName, maxRelativeFileNameMaxLen, "%s", orgFileName);  //bounded copy: never write past the caller-supplied buffer length
 			fclose(f);
 			return true;
 		}
 
-		//printf("Trying various directories, relative to current working directory\n");	
-			const char* prefix[]={"./","./data/","../data/","../../data/","../../../data/","../../../../data/"};
-			int numPrefixes = sizeof(prefix)/sizeof(const char*);
-	
-			f=0;
-			bool fileFound = false;
+		//printf("Trying various directories, relative to current working directory\n");
+		const char* prefix[] = {"./", "./data/", "../data/", "../../data/", "../../../data/", "../../../../data/"};
+		int numPrefixes = sizeof(prefix) / sizeof(const char*);
 
-			for (int i=0;!f && i<numPrefixes;i++)
-			{
+		f = 0;
+		bool fileFound = false;
+
+		for (int i = 0; !f && i < numPrefixes; i++)
+		{
 #ifdef _MSC_VER
-				sprintf_s(relativeFileName,maxRelativeFileNameMaxLen,"%s%s",prefix[i],orgFileName);
+			sprintf_s(relativeFileName, maxRelativeFileNameMaxLen, "%s%s", prefix[i], orgFileName);
 #else
-				sprintf(relativeFileName,"%s%s",prefix[i],orgFileName);
+			snprintf(relativeFileName, maxRelativeFileNameMaxLen, "%s%s", prefix[i], orgFileName);  //bounded like the sprintf_s branch above; truncates instead of overflowing
 #endif
-				f = fopen(relativeFileName,"rb");
-				if (f)
-				{
-					fileFound = true;
-					break;
-				}
-			}
+			f = fopen(relativeFileName, "rb");
 			if (f)
 			{
-				fclose(f);
+				fileFound = true;
+				break;
 			}
-	
+		}
+		if (f)
+		{
+			fclose(f);
+		}
+
 		return fileFound;
 	}
 
@@ -60,8 +60,8 @@ struct b3FileUtils
 	{
 		size_t const patlen = strlen(pattern);
 		size_t patcnt = 0;
-		const char * oriptr;
-		const char * patloc;
+		const char* oriptr;
+		const char* patloc;
 		// find how many times the pattern occurs in the original string
 		for (oriptr = name; (patloc = strstr(oriptr, pattern)); oriptr = patloc + patlen)
 		{
@@ -70,29 +70,27 @@ struct b3FileUtils
 		return oriptr;
 	}
 
-	
-
 	static int extractPath(const char* fileName, char* path, int maxPathLength)
 	{
 		const char* stripped = strip2(fileName, "/");
 		stripped = strip2(stripped, "\\");
 
-		ptrdiff_t len = stripped-fileName;
-		b3Assert((len+1)<maxPathLength);
+		ptrdiff_t len = stripped - fileName;
+		b3Assert((len + 1) < maxPathLength);
 
-		if (len && ((len+1)<maxPathLength))
+		if (len && ((len + 1) < maxPathLength))
 		{
-
-			for (int i=0;i<len;i++)
+			for (int i = 0; i < len; i++)
 			{
 				path[i] = fileName[i];
 			}
-			path[len]=0;
-		} else
+			path[len] = 0;
+		}
+		else
 		{
 			len = 0;
-			b3Assert(maxPathLength>0);
-			if (maxPathLength>0)
+			b3Assert(maxPathLength > 0);
+			if (maxPathLength > 0)
 			{
 				path[len] = 0;
 			}
@@ -102,23 +100,21 @@ struct b3FileUtils
 
 	static char toLowerChar(const char t)
 	{
-		if (t>=(char)'A' && t<=(char)'Z')
+		if (t >= (char)'A' && t <= (char)'Z')
 			return t + ((char)'a' - (char)'A');
 		else
 			return t;
 	}
 
-
 	static void toLower(char* str)
 	{
-		int len=strlen(str);
-		for (int i=0;i<len;i++)
+		int len = strlen(str);
+		for (int i = 0; i < len; i++)
 		{
 			str[i] = toLowerChar(str[i]);
 		}
 	}
 
-
 	/*static const char* strip2(const char* name, const char* pattern)
 	{
 		size_t const patlen = strlen(pattern);
@@ -133,6 +129,5 @@ struct b3FileUtils
 		return oriptr;
 	}
 	*/
-
 };
-#endif //B3_FILE_UTILS_H
+#endif  //B3_FILE_UTILS_H
diff --git a/thirdparty/bullet/Bullet3Common/b3HashMap.h b/thirdparty/bullet/Bullet3Common/b3HashMap.h
index 24a59d9baa..3009e2cf2f 100644
--- a/thirdparty/bullet/Bullet3Common/b3HashMap.h
+++ b/thirdparty/bullet/Bullet3Common/b3HashMap.h
@@ -13,86 +13,80 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef B3_HASH_MAP_H
 #define B3_HASH_MAP_H
 
 #include "b3AlignedObjectArray.h"
 
-
 #include <string>
 
 ///very basic hashable string implementation, compatible with b3HashMap
 struct b3HashString
 {
 	std::string m_string;
-	unsigned int	m_hash;
+	unsigned int m_hash;
 
-	B3_FORCE_INLINE	unsigned int getHash()const
+	B3_FORCE_INLINE unsigned int getHash() const
 	{
 		return m_hash;
 	}
 
-
 	b3HashString(const char* name)
-		:m_string(name)
+		: m_string(name)
 	{
-
 		/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
-		static const unsigned int  InitialFNV = 2166136261u;
+		static const unsigned int InitialFNV = 2166136261u;
 		static const unsigned int FNVMultiple = 16777619u;
 
 		/* Fowler / Noll / Vo (FNV) Hash */
 		unsigned int hash = InitialFNV;
 		int len = m_string.length();
-		for(int i = 0; i<len; i++)
+		for (int i = 0; i < len; i++)
 		{
-			hash = hash ^ (m_string[i]);       /* xor  the low 8 bits */
-			hash = hash * FNVMultiple;  /* multiply by the magic number */
+			hash = hash ^ (m_string[i]); /* xor  the low 8 bits */
+			hash = hash * FNVMultiple;   /* multiply by the magic number */
 		}
 		m_hash = hash;
 	}
 
-	int portableStringCompare(const char* src,	const char* dst) const
+	int portableStringCompare(const char* src, const char* dst) const
 	{
-			int ret = 0 ;
+		int ret = 0;
 
-			while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)
-					++src, ++dst;
+		while (!(ret = *(unsigned char*)src - *(unsigned char*)dst) && *dst)
+			++src, ++dst;
 
-			if ( ret < 0 )
-					ret = -1 ;
-			else if ( ret > 0 )
-					ret = 1 ;
+		if (ret < 0)
+			ret = -1;
+		else if (ret > 0)
+			ret = 1;
 
-			return( ret );
+		return (ret);
 	}
 
 	bool equals(const b3HashString& other) const
 	{
 		return (m_string == other.m_string);
 	}
-
 };
 
-
-const int B3_HASH_NULL=0xffffffff;
-
+const int B3_HASH_NULL = 0xffffffff;
 
 class b3HashInt
 {
-	int	m_uid;
+	int m_uid;
+
 public:
-	b3HashInt(int uid)	:m_uid(uid)
+	b3HashInt(int uid) : m_uid(uid)
 	{
 	}
 
-	int	getUid1() const
+	int getUid1() const
 	{
 		return m_uid;
 	}
 
-	void	setUid1(int uid)
+	void setUid1(int uid)
 	{
 		m_uid = uid;
 	}
@@ -102,34 +96,34 @@ public:
 		return getUid1() == other.getUid1();
 	}
 	//to our success
-	B3_FORCE_INLINE	unsigned int getHash()const
+	B3_FORCE_INLINE unsigned int getHash() const
 	{
 		int key = m_uid;
 		// Thomas Wang's hash
-		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		key += ~(key << 15);
+		key ^= (key >> 10);
+		key += (key << 3);
+		key ^= (key >> 6);
+		key += ~(key << 11);
+		key ^= (key >> 16);
 		return key;
 	}
 };
 
-
-
 class b3HashPtr
 {
-
-	union
-	{
-		const void*	m_pointer;
-		int	m_hashValues[2];
+	union {
+		const void* m_pointer;
+		int m_hashValues[2];
 	};
 
 public:
-
 	b3HashPtr(const void* ptr)
-		:m_pointer(ptr)
+		: m_pointer(ptr)
 	{
 	}
 
-	const void*	getPointer() const
+	const void* getPointer() const
 	{
 		return m_pointer;
 	}
@@ -140,65 +134,69 @@ public:
 	}
 
 	//to our success
-	B3_FORCE_INLINE	unsigned int getHash()const
+	B3_FORCE_INLINE unsigned int getHash() const
 	{
-		const bool VOID_IS_8 = ((sizeof(void*)==8));
-		
-		int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0];
-	
+		const bool VOID_IS_8 = ((sizeof(void*) == 8));
+
+		int key = VOID_IS_8 ? m_hashValues[0] + m_hashValues[1] : m_hashValues[0];
+
 		// Thomas Wang's hash
-		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		key += ~(key << 15);
+		key ^= (key >> 10);
+		key += (key << 3);
+		key ^= (key >> 6);
+		key += ~(key << 11);
+		key ^= (key >> 16);
 		return key;
 	}
-
-	
 };
 
-
 template <class Value>
 class b3HashKeyPtr
 {
-        int     m_uid;
+	int m_uid;
+
 public:
+	b3HashKeyPtr(int uid) : m_uid(uid)
+	{
+	}
 
-        b3HashKeyPtr(int uid)    :m_uid(uid)
-        {
-        }
-
-        int     getUid1() const
-        {
-                return m_uid;
-        }
-
-        bool equals(const b3HashKeyPtr<Value>& other) const
-        {
-                return getUid1() == other.getUid1();
-        }
-
-        //to our success
-        B3_FORCE_INLINE       unsigned int getHash()const
-        {
-                int key = m_uid;
-                // Thomas Wang's hash
-                key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
-                return key;
-        }
-
-        
-};
+	int getUid1() const
+	{
+		return m_uid;
+	}
 
+	bool equals(const b3HashKeyPtr<Value>& other) const
+	{
+		return getUid1() == other.getUid1();
+	}
+
+	//to our success
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		int key = m_uid;
+		// Thomas Wang's hash
+		key += ~(key << 15);
+		key ^= (key >> 10);
+		key += (key << 3);
+		key ^= (key >> 6);
+		key += ~(key << 11);
+		key ^= (key >> 16);
+		return key;
+	}
+};
 
 template <class Value>
 class b3HashKey
 {
-	int	m_uid;
-public:
+	int m_uid;
 
-	b3HashKey(int uid)	:m_uid(uid)
+public:
+	b3HashKey(int uid) : m_uid(uid)
 	{
 	}
 
-	int	getUid1() const
+	int getUid1() const
 	{
 		return m_uid;
 	}
@@ -208,30 +206,33 @@ public:
 		return getUid1() == other.getUid1();
 	}
 	//to our success
-	B3_FORCE_INLINE	unsigned int getHash()const
+	B3_FORCE_INLINE unsigned int getHash() const
 	{
 		int key = m_uid;
 		// Thomas Wang's hash
-		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		key += ~(key << 15);
+		key ^= (key >> 10);
+		key += (key << 3);
+		key ^= (key >> 6);
+		key += ~(key << 11);
+		key ^= (key >> 16);
 		return key;
 	}
 };
 
-
 ///The b3HashMap template class implements a generic and lightweight hashmap.
 ///A basic sample of how to use b3HashMap is located in Demos\BasicDemo\main.cpp
 template <class Key, class Value>
 class b3HashMap
 {
-
 protected:
-	b3AlignedObjectArray<int>		m_hashTable;
-	b3AlignedObjectArray<int>		m_next;
-	
-	b3AlignedObjectArray<Value>		m_valueArray;
-	b3AlignedObjectArray<Key>		m_keyArray;
+	b3AlignedObjectArray<int> m_hashTable;
+	b3AlignedObjectArray<int> m_next;
+
+	b3AlignedObjectArray<Value> m_valueArray;
+	b3AlignedObjectArray<Key> m_keyArray;
 
-	void	growTables(const Key& /*key*/)
+	void growTables(const Key& /*key*/)
 	{
 		int newCapacity = m_valueArray.capacity();
 
@@ -245,7 +246,7 @@ protected:
 
 			int i;
 
-			for (i= 0; i < newCapacity; ++i)
+			for (i = 0; i < newCapacity; ++i)
 			{
 				m_hashTable[i] = B3_HASH_NULL;
 			}
@@ -254,30 +255,28 @@ protected:
 				m_next[i] = B3_HASH_NULL;
 			}
 
-			for(i=0;i<curHashtableSize;i++)
+			for (i = 0; i < curHashtableSize; i++)
 			{
 				//const Value& value = m_valueArray[i];
 				//const Key& key = m_keyArray[i];
 
-				int	hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1);	// New hash value with new mask
+				int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity() - 1);  // New hash value with new mask
 				m_next[i] = m_hashTable[hashValue];
 				m_hashTable[hashValue] = i;
 			}
-
-
 		}
 	}
 
-	public:
-
-	void insert(const Key& key, const Value& value) {
-		int hash = key.getHash() & (m_valueArray.capacity()-1);
+public:
+	void insert(const Key& key, const Value& value)
+	{
+		int hash = key.getHash() & (m_valueArray.capacity() - 1);
 
 		//replace value if the key is already there
 		int index = findIndex(key);
 		if (index != B3_HASH_NULL)
 		{
-			m_valueArray[index]=value;
+			m_valueArray[index] = value;
 			return;
 		}
 
@@ -291,19 +290,19 @@ protected:
 		{
 			growTables(key);
 			//hash with new capacity
-			hash = key.getHash() & (m_valueArray.capacity()-1);
+			hash = key.getHash() & (m_valueArray.capacity() - 1);
 		}
 		m_next[count] = m_hashTable[hash];
 		m_hashTable[hash] = count;
 	}
 
-	void remove(const Key& key) {
-
-		int hash = key.getHash() & (m_valueArray.capacity()-1);
+	void remove(const Key& key)
+	{
+		int hash = key.getHash() & (m_valueArray.capacity() - 1);
 
 		int pairIndex = findIndex(key);
-		
-		if (pairIndex ==B3_HASH_NULL)
+
+		if (pairIndex == B3_HASH_NULL)
 		{
 			return;
 		}
@@ -344,7 +343,7 @@ protected:
 		}
 
 		// Remove the last pair from the hash table.
-		int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
+		int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity() - 1);
 
 		index = m_hashTable[lastHash];
 		b3Assert(index != B3_HASH_NULL);
@@ -376,10 +375,8 @@ protected:
 
 		m_valueArray.pop_back();
 		m_keyArray.pop_back();
-
 	}
 
-
 	int size() const
 	{
 		return m_valueArray.size();
@@ -399,23 +396,24 @@ protected:
 		return &m_valueArray[index];
 	}
 
-	 Key getKeyAtIndex(int index)
-    {
-        b3Assert(index < m_keyArray.size());
-        return m_keyArray[index];
-    }
-    
-    const Key getKeyAtIndex(int index) const
-    {
-        b3Assert(index < m_keyArray.size());
-        return m_keyArray[index];
-    }
+	Key getKeyAtIndex(int index)
+	{
+		b3Assert(index < m_keyArray.size());
+		return m_keyArray[index];
+	}
+
+	const Key getKeyAtIndex(int index) const
+	{
+		b3Assert(index < m_keyArray.size());
+		return m_keyArray[index];
+	}
 
-	Value* operator[](const Key& key) {
+	Value* operator[](const Key& key)
+	{
 		return find(key);
 	}
 
-	const Value*	find(const Key& key) const
+	const Value* find(const Key& key) const
 	{
 		int index = findIndex(key);
 		if (index == B3_HASH_NULL)
@@ -425,7 +423,7 @@ protected:
 		return &m_valueArray[index];
 	}
 
-	Value*	find(const Key& key)
+	Value* find(const Key& key)
 	{
 		int index = findIndex(key);
 		if (index == B3_HASH_NULL)
@@ -435,10 +433,9 @@ protected:
 		return &m_valueArray[index];
 	}
 
-
-	int	findIndex(const Key& key) const
+	int findIndex(const Key& key) const
 	{
-		unsigned int hash = key.getHash() & (m_valueArray.capacity()-1);
+		unsigned int hash = key.getHash() & (m_valueArray.capacity() - 1);
 
 		if (hash >= (unsigned int)m_hashTable.size())
 		{
@@ -453,14 +450,13 @@ protected:
 		return index;
 	}
 
-	void	clear()
+	void clear()
 	{
 		m_hashTable.clear();
 		m_next.clear();
 		m_valueArray.clear();
 		m_keyArray.clear();
 	}
-
 };
 
-#endif //B3_HASH_MAP_H
+#endif  //B3_HASH_MAP_H
diff --git a/thirdparty/bullet/Bullet3Common/b3Logging.cpp b/thirdparty/bullet/Bullet3Common/b3Logging.cpp
index a8e9507155..9c9f7c09ea 100644
--- a/thirdparty/bullet/Bullet3Common/b3Logging.cpp
+++ b/thirdparty/bullet/Bullet3Common/b3Logging.cpp
@@ -20,17 +20,16 @@ subject to the following restrictions:
 
 #ifdef _WIN32
 #include <windows.h>
-#endif //_WIN32
-
+#endif  //_WIN32
 
 void b3PrintfFuncDefault(const char* msg)
 {
 #ifdef _WIN32
 	OutputDebugStringA(msg);
 #endif
-	printf("%s",msg);
-    //is this portable?
-    fflush(stdout);
+	printf("%s", msg);
+	//is this portable?
+	fflush(stdout);
 }
 
 void b3WarningMessageFuncDefault(const char* msg)
@@ -38,32 +37,26 @@ void b3WarningMessageFuncDefault(const char* msg)
 #ifdef _WIN32
 	OutputDebugStringA(msg);
 #endif
-	printf("%s",msg);
-    //is this portable?
-    fflush(stdout);
-
+	printf("%s", msg);
+	//is this portable?
+	fflush(stdout);
 }
 
-
 void b3ErrorMessageFuncDefault(const char* msg)
 {
 #ifdef _WIN32
 	OutputDebugStringA(msg);
 #endif
-	printf("%s",msg);
+	printf("%s", msg);
 
-    //is this portable?
-    fflush(stdout);
-    
+	//is this portable?
+	fflush(stdout);
 }
 
-
-
 static b3PrintfFunc* b3s_printfFunc = b3PrintfFuncDefault;
 static b3WarningMessageFunc* b3s_warningMessageFunc = b3WarningMessageFuncDefault;
 static b3ErrorMessageFunc* b3s_errorMessageFunc = b3ErrorMessageFuncDefault;
 
-
 ///The developer can route b3Printf output using their own implementation
 void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc)
 {
@@ -81,54 +74,50 @@ void b3SetCustomErrorMessageFunc(b3PrintfFunc* errorMessageFunc)
 //#define B3_MAX_DEBUG_STRING_LENGTH 2048
 #define B3_MAX_DEBUG_STRING_LENGTH 32768
 
-
-void b3OutputPrintfVarArgsInternal(const char *str, ...)
+void b3OutputPrintfVarArgsInternal(const char* str, ...)
 {
-    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0};
-    va_list argList;
-    va_start(argList, str);
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
 #ifdef _MSC_VER
-    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #else
-    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #endif
-        (b3s_printfFunc)(strDebug);
-    va_end(argList);    
+	(b3s_printfFunc)(strDebug);
+	va_end(argList);
 }
-void b3OutputWarningMessageVarArgsInternal(const char *str, ...)
+void b3OutputWarningMessageVarArgsInternal(const char* str, ...)
 {
-    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0};
-    va_list argList;
-    va_start(argList, str);
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
 #ifdef _MSC_VER
-    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #else
-    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #endif
-        (b3s_warningMessageFunc)(strDebug);
-    va_end(argList);    
+	(b3s_warningMessageFunc)(strDebug);
+	va_end(argList);
 }
-void b3OutputErrorMessageVarArgsInternal(const char *str, ...)
+void b3OutputErrorMessageVarArgsInternal(const char* str, ...)
 {
-	
-    char strDebug[B3_MAX_DEBUG_STRING_LENGTH]={0};
-    va_list argList;
-    va_start(argList, str);
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
 #ifdef _MSC_VER
-    vsprintf_s(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #else
-    vsnprintf(strDebug,B3_MAX_DEBUG_STRING_LENGTH,str,argList);
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
 #endif
-        (b3s_errorMessageFunc)(strDebug);
-    va_end(argList);    
-
+	(b3s_errorMessageFunc)(strDebug);
+	va_end(argList);
 }
 
-
-void	b3EnterProfileZoneDefault(const char* name)
+void b3EnterProfileZoneDefault(const char* name)
 {
 }
-void	b3LeaveProfileZoneDefault()
+void b3LeaveProfileZoneDefault()
 {
 }
 static b3EnterProfileZoneFunc* b3s_enterFunc = b3EnterProfileZoneDefault;
@@ -151,10 +140,6 @@ void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc)
 	b3s_leaveFunc = leaveFunc;
 }
 
-
-
-
 #ifndef _MSC_VER
 #undef vsprintf_s
 #endif
-
diff --git a/thirdparty/bullet/Bullet3Common/b3Logging.h b/thirdparty/bullet/Bullet3Common/b3Logging.h
index b302effe43..9c92b12ebb 100644
--- a/thirdparty/bullet/Bullet3Common/b3Logging.h
+++ b/thirdparty/bullet/Bullet3Common/b3Logging.h
@@ -3,75 +3,84 @@
 #define B3_LOGGING_H
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
-    
+
 ///We add the do/while so that the statement "if (condition) b3Printf("test"); else {...}" would fail
 ///You can also customize the message by uncommenting out a different line below
 #define b3Printf(...) b3OutputPrintfVarArgsInternal(__VA_ARGS__)
-//#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0)
-//#define b3Printf b3OutputPrintfVarArgsInternal
-//#define b3Printf(...) printf(__VA_ARGS__)
-//#define b3Printf(...)
-
-#define b3Warning(...) do {b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n",__FILE__,__LINE__);b3OutputWarningMessageVarArgsInternal(__VA_ARGS__); }while(0)
-#define b3Error(...) do {b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n",__FILE__,__LINE__);b3OutputErrorMessageVarArgsInternal(__VA_ARGS__); } while(0)
-
+	//#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0)
+	//#define b3Printf b3OutputPrintfVarArgsInternal
+	//#define b3Printf(...) printf(__VA_ARGS__)
+	//#define b3Printf(...)
+
+#define b3Warning(...)                                                                    \
+	do                                                                                    \
+	{                                                                                     \
+		b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n", __FILE__, __LINE__); \
+		b3OutputWarningMessageVarArgsInternal(__VA_ARGS__);                               \
+	} while (0)
+#define b3Error(...)                                                                  \
+	do                                                                                \
+	{                                                                                 \
+		b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n", __FILE__, __LINE__); \
+		b3OutputErrorMessageVarArgsInternal(__VA_ARGS__);                             \
+	} while (0)
 
 #ifndef B3_NO_PROFILE
 
-void b3EnterProfileZone(const char* name);
-void b3LeaveProfileZone();
+	void b3EnterProfileZone(const char* name);
+	void b3LeaveProfileZone();
 #ifdef __cplusplus
 
-class	b3ProfileZone
-{
-public:
-	b3ProfileZone(const char* name)
-	{ 
-		b3EnterProfileZone( name ); 
-	}
-
-	~b3ProfileZone()
-	{ 
-		b3LeaveProfileZone(); 
-	}
-};
-
-#define	B3_PROFILE( name )			b3ProfileZone __profile( name )
+	class b3ProfileZone
+	{
+	public:
+		b3ProfileZone(const char* name)
+		{
+			b3EnterProfileZone(name);
+		}
+
+		~b3ProfileZone()
+		{
+			b3LeaveProfileZone();
+		}
+	};
+
+#define B3_PROFILE(name) b3ProfileZone __profile(name)
 #endif
 
-#else //B3_NO_PROFILE
+#else  //B3_NO_PROFILE
 
-#define	B3_PROFILE( name )
+#define B3_PROFILE(name)
 #define b3StartProfile(a)
 #define b3StopProfile
 
-#endif //#ifndef B3_NO_PROFILE
-
+#endif  //#ifndef B3_NO_PROFILE
 
-typedef void (b3PrintfFunc)(const char* msg);
-typedef void (b3WarningMessageFunc)(const char* msg);
-typedef void (b3ErrorMessageFunc)(const char* msg);
-typedef void (b3EnterProfileZoneFunc)(const char* msg);
-typedef void (b3LeaveProfileZoneFunc)();
+	typedef void(b3PrintfFunc)(const char* msg);
+	typedef void(b3WarningMessageFunc)(const char* msg);
+	typedef void(b3ErrorMessageFunc)(const char* msg);
+	typedef void(b3EnterProfileZoneFunc)(const char* msg);
+	typedef void(b3LeaveProfileZoneFunc)();
 
-///The developer can route b3Printf output using their own implementation
-void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc);
-void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc);
-void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc);
+	///The developer can route b3Printf output using their own implementation
+	void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc);
+	void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc);
+	void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc);
 
-///Set custom profile zone functions (zones can be nested)
-void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc);
-void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc);
+	///Set custom profile zone functions (zones can be nested)
+	void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc);
+	void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc);
 
-///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version)
-void b3OutputPrintfVarArgsInternal(const char *str, ...);
-void b3OutputWarningMessageVarArgsInternal(const char *str, ...);
-void b3OutputErrorMessageVarArgsInternal(const char *str, ...);
+	///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version)
+	void b3OutputPrintfVarArgsInternal(const char* str, ...);
+	void b3OutputWarningMessageVarArgsInternal(const char* str, ...);
+	void b3OutputErrorMessageVarArgsInternal(const char* str, ...);
 
 #ifdef __cplusplus
-    }
+}
 #endif
 
-#endif//B3_LOGGING_H
-\ No newline at end of file
+#endif  //B3_LOGGING_H
+\ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h b/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h
index 89b57cf59a..6c46536a81 100644
--- a/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h
+++ b/thirdparty/bullet/Bullet3Common/b3Matrix3x3.h
@@ -12,8 +12,7 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-#ifndef	B3_MATRIX3x3_H
+#ifndef B3_MATRIX3x3_H
 #define B3_MATRIX3x3_H
 
 #include "b3Vector3.h"
@@ -32,22 +31,22 @@ const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
 #endif
 
 #ifdef B3_USE_DOUBLE_PRECISION
-#define b3Matrix3x3Data	b3Matrix3x3DoubleData 
+#define b3Matrix3x3Data b3Matrix3x3DoubleData
 #else
-#define b3Matrix3x3Data	b3Matrix3x3FloatData
-#endif //B3_USE_DOUBLE_PRECISION
-
+#define b3Matrix3x3Data b3Matrix3x3FloatData
+#endif  //B3_USE_DOUBLE_PRECISION
 
 /**@brief The b3Matrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with b3Quaternion, b3Transform and b3Vector3.
 * Make sure to only include a pure orthogonal matrix without scaling. */
-B3_ATTRIBUTE_ALIGNED16(class) b3Matrix3x3 {
-
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Matrix3x3
+{
 	///Data storage for the matrix, each vector is a row of the matrix
 	b3Vector3 m_el[3];
 
 public:
 	/** @brief No initializaion constructor */
-	b3Matrix3x3 () {}
+	b3Matrix3x3() {}
 
 	//		explicit b3Matrix3x3(const b3Scalar *m) { setFromOpenGLSubMatrix(m); }
 
@@ -62,27 +61,27 @@ public:
 	*/
 	/** @brief Constructor with row major formatting */
 	b3Matrix3x3(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
-		const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
-		const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
-	{ 
-		setValue(xx, xy, xz, 
-			yx, yy, yz, 
-			zx, zy, zz);
+				const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
+				const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	{
+		setValue(xx, xy, xz,
+				 yx, yy, yz,
+				 zx, zy, zz);
 	}
 
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
-	B3_FORCE_INLINE b3Matrix3x3 (const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2 ) 
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	B3_FORCE_INLINE b3Matrix3x3(const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2)
 	{
-        m_el[0].mVec128 = v0;
-        m_el[1].mVec128 = v1;
-        m_el[2].mVec128 = v2;
+		m_el[0].mVec128 = v0;
+		m_el[1].mVec128 = v1;
+		m_el[2].mVec128 = v2;
 	}
 
-	B3_FORCE_INLINE b3Matrix3x3 (const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2 ) 
+	B3_FORCE_INLINE b3Matrix3x3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2)
 	{
-        m_el[0] = v0;
-        m_el[1] = v1;
-        m_el[2] = v2;
+		m_el[0] = v0;
+		m_el[1] = v1;
+		m_el[2] = v2;
 	}
 
 	// Copy constructor
@@ -94,25 +93,25 @@ public:
 	}
 
 	// Assignment Operator
-	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m) 
+	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m)
 	{
 		m_el[0].mVec128 = m.m_el[0].mVec128;
 		m_el[1].mVec128 = m.m_el[1].mVec128;
 		m_el[2].mVec128 = m.m_el[2].mVec128;
-		
+
 		return *this;
 	}
 
 #else
 
 	/** @brief Copy constructor */
-	B3_FORCE_INLINE b3Matrix3x3 (const b3Matrix3x3& other)
+	B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& other)
 	{
 		m_el[0] = other.m_el[0];
 		m_el[1] = other.m_el[1];
 		m_el[2] = other.m_el[2];
 	}
-    
+
 	/** @brief Assignment Operator */
 	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& other)
 	{
@@ -128,10 +127,9 @@ public:
 	*  @param i Column number 0 indexed */
 	B3_FORCE_INLINE b3Vector3 getColumn(int i) const
 	{
-		return b3MakeVector3(m_el[0][i],m_el[1][i],m_el[2][i]);
+		return b3MakeVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
 	}
 
-
 	/** @brief Get a row of the matrix as a vector 
 	*  @param i Row number 0 indexed */
 	B3_FORCE_INLINE const b3Vector3& getRow(int i) const
@@ -142,10 +140,10 @@ public:
 
 	/** @brief Get a mutable reference to a row of the matrix as a vector 
 	*  @param i Row number 0 indexed */
-	B3_FORCE_INLINE b3Vector3&  operator[](int i)
-	{ 
+	B3_FORCE_INLINE b3Vector3& operator[](int i)
+	{
 		b3FullAssert(0 <= i && i < 3);
-		return m_el[i]; 
+		return m_el[i];
 	}
 
 	/** @brief Get a const reference to a row of the matrix as a vector 
@@ -153,32 +151,31 @@ public:
 	B3_FORCE_INLINE const b3Vector3& operator[](int i) const
 	{
 		b3FullAssert(0 <= i && i < 3);
-		return m_el[i]; 
+		return m_el[i];
 	}
 
 	/** @brief Multiply by the target matrix on the right
 	*  @param m Rotation matrix to be applied 
 	* Equivilant to this = this * m */
-	b3Matrix3x3& operator*=(const b3Matrix3x3& m); 
+	b3Matrix3x3& operator*=(const b3Matrix3x3& m);
 
 	/** @brief Adds by the target matrix on the right
 	*  @param m matrix to be applied 
 	* Equivilant to this = this + m */
-	b3Matrix3x3& operator+=(const b3Matrix3x3& m); 
+	b3Matrix3x3& operator+=(const b3Matrix3x3& m);
 
 	/** @brief Substractss by the target matrix on the right
 	*  @param m matrix to be applied 
 	* Equivilant to this = this - m */
-	b3Matrix3x3& operator-=(const b3Matrix3x3& m); 
+	b3Matrix3x3& operator-=(const b3Matrix3x3& m);
 
 	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
 	*  @param m A pointer to the beginning of the array of scalars*/
-	void setFromOpenGLSubMatrix(const b3Scalar *m)
+	void setFromOpenGLSubMatrix(const b3Scalar* m)
 	{
-		m_el[0].setValue(m[0],m[4],m[8]);
-		m_el[1].setValue(m[1],m[5],m[9]);
-		m_el[2].setValue(m[2],m[6],m[10]);
-
+		m_el[0].setValue(m[0], m[4], m[8]);
+		m_el[1].setValue(m[1], m[5], m[9]);
+		m_el[2].setValue(m[2], m[6], m[10]);
 	}
 	/** @brief Set the values of the matrix explicitly (row major)
 	*  @param xx Top left
@@ -190,93 +187,92 @@ public:
 	*  @param zx Bottom Left
 	*  @param zy Bottom Middle
 	*  @param zz Bottom Right*/
-	void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz, 
-		const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz, 
-		const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
+				  const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
+				  const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
 	{
-		m_el[0].setValue(xx,xy,xz);
-		m_el[1].setValue(yx,yy,yz);
-		m_el[2].setValue(zx,zy,zz);
+		m_el[0].setValue(xx, xy, xz);
+		m_el[1].setValue(yx, yy, yz);
+		m_el[2].setValue(zx, zy, zz);
 	}
 
 	/** @brief Set the matrix from a quaternion
-	*  @param q The Quaternion to match */  
-	void setRotation(const b3Quaternion& q) 
+	*  @param q The Quaternion to match */
+	void setRotation(const b3Quaternion& q)
 	{
 		b3Scalar d = q.length2();
 		b3FullAssert(d != b3Scalar(0.0));
 		b3Scalar s = b3Scalar(2.0) / d;
-    
-    #if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-        __m128	vs, Q = q.get128();
+
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs, Q = q.get128();
 		__m128i Qi = b3CastfTo128i(Q);
-        __m128	Y, Z;
-        __m128	V1, V2, V3;
-        __m128	V11, V21, V31;
-        __m128	NQ = _mm_xor_ps(Q, b3vMzeroMask);
+		__m128 Y, Z;
+		__m128 V1, V2, V3;
+		__m128 V11, V21, V31;
+		__m128 NQ = _mm_xor_ps(Q, b3vMzeroMask);
 		__m128i NQi = b3CastfTo128i(NQ);
-        
-        V1 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,2,3)));	// Y X Z W
-		V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0,0,1,3));     // -X -X  Y  W
-        V3 = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(2,1,0,3)));	// Z Y X W
-        V1 = _mm_xor_ps(V1, b3vMPPP);	//	change the sign of the first element
-			
-        V11	= b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,1,0,3)));	// Y Y X W
-		V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
-		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0,2,0,3));	//  X  Z -X -W
-
-		V2 = V2 * V1;	//
-		V1 = V1 * V11;	//
-		V3 = V3 * V31;	//
-
-        V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2,3,1,3));	//	-Z -W  Y  W
-		V11 = V11 * V21;	//
-        V21 = _mm_xor_ps(V21, b3vMPPP);	//	change the sign of the first element
-		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3,3,1,3));	//	 W  W -Y -W
-        V31 = _mm_xor_ps(V31, b3vMPPP);	//	change the sign of the first element
-		Y = b3CastiTo128f(_mm_shuffle_epi32 (NQi, B3_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
-		Z = b3CastiTo128f(_mm_shuffle_epi32 (Qi, B3_SHUFFLE(1,0,1,3)));	//  Y  X  Y  W
+
+		V1 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 2, 3)));  // Y X Z W
+		V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0, 0, 1, 3));                 // -X -X  Y  W
+		V3 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(2, 1, 0, 3)));  // Z Y X W
+		V1 = _mm_xor_ps(V1, b3vMPPP);                                       //	change the sign of the first element
+
+		V11 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 1, 0, 3)));  // Y Y X W
+		V21 = _mm_unpackhi_ps(Q, Q);                                         //  Z  Z  W  W
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0, 2, 0, 3));                 //  X  Z -X -W
+
+		V2 = V2 * V1;   //
+		V1 = V1 * V11;  //
+		V3 = V3 * V31;  //
+
+		V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2, 3, 1, 3));                //	-Z -W  Y  W
+		V11 = V11 * V21;                                                    //
+		V21 = _mm_xor_ps(V21, b3vMPPP);                                     //	change the sign of the first element
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3, 3, 1, 3));                //	 W  W -Y -W
+		V31 = _mm_xor_ps(V31, b3vMPPP);                                     //	change the sign of the first element
+		Y = b3CastiTo128f(_mm_shuffle_epi32(NQi, B3_SHUFFLE(3, 2, 0, 3)));  // -W -Z -X -W
+		Z = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 1, 3)));   //  Y  X  Y  W
 
 		vs = _mm_load_ss(&s);
 		V21 = V21 * Y;
 		V31 = V31 * Z;
 
 		V1 = V1 + V11;
-        V2 = V2 + V21;
-        V3 = V3 + V31;
-
-        vs = b3_splat3_ps(vs, 0);
-            //	s ready
-        V1 = V1 * vs;
-        V2 = V2 * vs;
-        V3 = V3 * vs;
-        
-        V1 = V1 + b3v1000;
-        V2 = V2 + b3v0100;
-        V3 = V3 + b3v0010;
-        
-        m_el[0] = b3MakeVector3(V1); 
-        m_el[1] = b3MakeVector3(V2);
-        m_el[2] = b3MakeVector3(V3);
-    #else    
-		b3Scalar xs = q.getX() * s,   ys = q.getY() * s,   zs = q.getZ() * s;
-		b3Scalar wx = q.getW() * xs,  wy = q.getW() * ys,  wz = q.getW() * zs;
-		b3Scalar xx = q.getX() * xs,  xy = q.getX() * ys,  xz = q.getX() * zs;
-		b3Scalar yy = q.getY() * ys,  yz = q.getY() * zs,  zz = q.getZ() * zs;
+		V2 = V2 + V21;
+		V3 = V3 + V31;
+
+		vs = b3_splat3_ps(vs, 0);
+		//	s ready
+		V1 = V1 * vs;
+		V2 = V2 * vs;
+		V3 = V3 * vs;
+
+		V1 = V1 + b3v1000;
+		V2 = V2 + b3v0100;
+		V3 = V3 + b3v0010;
+
+		m_el[0] = b3MakeVector3(V1);
+		m_el[1] = b3MakeVector3(V2);
+		m_el[2] = b3MakeVector3(V3);
+#else
+		b3Scalar xs = q.getX() * s, ys = q.getY() * s, zs = q.getZ() * s;
+		b3Scalar wx = q.getW() * xs, wy = q.getW() * ys, wz = q.getW() * zs;
+		b3Scalar xx = q.getX() * xs, xy = q.getX() * ys, xz = q.getX() * zs;
+		b3Scalar yy = q.getY() * ys, yz = q.getY() * zs, zz = q.getZ() * zs;
 		setValue(
-            b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy,
+			b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy,
 			xy + wz, b3Scalar(1.0) - (xx + zz), yz - wx,
 			xz - wy, yz + wx, b3Scalar(1.0) - (xx + yy));
-	#endif
-    }
-
+#endif
+	}
 
 	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
 	*  @param yaw Yaw about Y axis
 	*  @param pitch Pitch about X axis
 	*  @param roll Roll about Z axis 
 	*/
-	void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll) 
+	void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
 	{
 		setEulerZYX(roll, pitch, yaw);
 	}
@@ -290,182 +286,197 @@ public:
 	* angles are applied in ZYX order. I.e a vector is first rotated 
 	* about X then Y and then Z
 	**/
-	void setEulerZYX(b3Scalar eulerX,b3Scalar eulerY,b3Scalar eulerZ) { 
+	void setEulerZYX(b3Scalar eulerX, b3Scalar eulerY, b3Scalar eulerZ)
+	{
 		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
-		b3Scalar ci ( b3Cos(eulerX)); 
-		b3Scalar cj ( b3Cos(eulerY)); 
-		b3Scalar ch ( b3Cos(eulerZ)); 
-		b3Scalar si ( b3Sin(eulerX)); 
-		b3Scalar sj ( b3Sin(eulerY)); 
-		b3Scalar sh ( b3Sin(eulerZ)); 
-		b3Scalar cc = ci * ch; 
-		b3Scalar cs = ci * sh; 
-		b3Scalar sc = si * ch; 
+		b3Scalar ci(b3Cos(eulerX));
+		b3Scalar cj(b3Cos(eulerY));
+		b3Scalar ch(b3Cos(eulerZ));
+		b3Scalar si(b3Sin(eulerX));
+		b3Scalar sj(b3Sin(eulerY));
+		b3Scalar sh(b3Sin(eulerZ));
+		b3Scalar cc = ci * ch;
+		b3Scalar cs = ci * sh;
+		b3Scalar sc = si * ch;
 		b3Scalar ss = si * sh;
 
 		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
-			cj * sh, sj * ss + cc, sj * cs - sc, 
-			-sj,      cj * si,      cj * ci);
+				 cj * sh, sj * ss + cc, sj * cs - sc,
+				 -sj, cj * si, cj * ci);
 	}
 
 	/**@brief Set the matrix to the identity */
 	void setIdentity()
-	{ 
-#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON)
-			m_el[0] = b3MakeVector3(b3v1000); 
-			m_el[1] = b3MakeVector3(b3v0100);
-			m_el[2] = b3MakeVector3(b3v0010);
+	{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		m_el[0] = b3MakeVector3(b3v1000);
+		m_el[1] = b3MakeVector3(b3v0100);
+		m_el[2] = b3MakeVector3(b3v0010);
 #else
-		setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), 
-			b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), 
-			b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0)); 
+		setValue(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0),
+				 b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0),
+				 b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
 #endif
 	}
 
-	static const b3Matrix3x3&	getIdentity()
+	static const b3Matrix3x3& getIdentity()
 	{
-#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE)) || defined(B3_USE_NEON)
-        static const b3Matrix3x3 
-        identityMatrix(b3v1000, b3v0100, b3v0010);
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		static const b3Matrix3x3
+			identityMatrix(b3v1000, b3v0100, b3v0010);
 #else
-		static const b3Matrix3x3 
-        identityMatrix(
-            b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0), 
-			b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0), 
-			b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
+		static const b3Matrix3x3
+			identityMatrix(
+				b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0),
+				b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0),
+				b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
 #endif
 		return identityMatrix;
 	}
 
 	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
 	* @param m The array to be filled */
-	void getOpenGLSubMatrix(b3Scalar *m) const 
+	void getOpenGLSubMatrix(b3Scalar * m) const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-        __m128 v0 = m_el[0].mVec128;
-        __m128 v1 = m_el[1].mVec128;
-        __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
-        __m128 *vm = (__m128 *)m;
-        __m128 vT;
-        
-        v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
-        
-        vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
-        v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
-
-        v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
-        v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
-        v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));	// z0 z1 z2 0
-
-        vm[0] = v0;
-        vm[1] = v1;
-        vm[2] = v2;
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 v0 = m_el[0].mVec128;
+		__m128 v1 = m_el[1].mVec128;
+		__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
+		__m128* vm = (__m128*)m;
+		__m128 vT;
+
+		v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+
+		vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
+		v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1
+
+		v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
+		v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
+		v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));  // z0 z1 z2 0
+
+		vm[0] = v0;
+		vm[1] = v1;
+		vm[2] = v2;
 #elif defined(B3_USE_NEON)
-        // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
-        static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
-        float32x4_t *vm = (float32x4_t *)m;
-        float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
-        float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
-        float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
-        float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
-        float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
-        float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
-
-        vm[0] = v0;
-        vm[1] = v1;
-        vm[2] = v2;
+		// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+		static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
+		float32x4_t* vm = (float32x4_t*)m;
+		float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+		float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
+		float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
+		float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
+		float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
+		float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
+
+		vm[0] = v0;
+		vm[1] = v1;
+		vm[2] = v2;
 #else
-		m[0]  = b3Scalar(m_el[0].getX()); 
-		m[1]  = b3Scalar(m_el[1].getX());
-		m[2]  = b3Scalar(m_el[2].getX());
-		m[3]  = b3Scalar(0.0); 
-		m[4]  = b3Scalar(m_el[0].getY());
-		m[5]  = b3Scalar(m_el[1].getY());
-		m[6]  = b3Scalar(m_el[2].getY());
-		m[7]  = b3Scalar(0.0); 
-		m[8]  = b3Scalar(m_el[0].getZ()); 
-		m[9]  = b3Scalar(m_el[1].getZ());
+		m[0] = b3Scalar(m_el[0].getX());
+		m[1] = b3Scalar(m_el[1].getX());
+		m[2] = b3Scalar(m_el[2].getX());
+		m[3] = b3Scalar(0.0);
+		m[4] = b3Scalar(m_el[0].getY());
+		m[5] = b3Scalar(m_el[1].getY());
+		m[6] = b3Scalar(m_el[2].getY());
+		m[7] = b3Scalar(0.0);
+		m[8] = b3Scalar(m_el[0].getZ());
+		m[9] = b3Scalar(m_el[1].getZ());
 		m[10] = b3Scalar(m_el[2].getZ());
-		m[11] = b3Scalar(0.0); 
+		m[11] = b3Scalar(0.0);
 #endif
 	}
 
 	/**@brief Get the matrix represented as a quaternion 
 	* @param q The quaternion which will be set */
-	void getRotation(b3Quaternion& q) const
+	void getRotation(b3Quaternion & q) const
 	{
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
-        b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
-        b3Scalar s, x;
-        
-        union {
-            b3SimdFloat4 vec;
-            b3Scalar f[4];
-        } temp;
-        
-        if (trace > b3Scalar(0.0)) 
-        {
-            x = trace + b3Scalar(1.0);
-
-            temp.f[0]=m_el[2].getY() - m_el[1].getZ();
-            temp.f[1]=m_el[0].getZ() - m_el[2].getX();
-            temp.f[2]=m_el[1].getX() - m_el[0].getY();
-            temp.f[3]=x;
-            //temp.f[3]= s * b3Scalar(0.5);
-        } 
-        else 
-        {
-            int i, j, k;
-            if(m_el[0].getX() < m_el[1].getY()) 
-            { 
-                if( m_el[1].getY() < m_el[2].getZ() )
-                    { i = 2; j = 0; k = 1; }
-                else
-                    { i = 1; j = 2; k = 0; }
-            }
-            else
-            {
-                if( m_el[0].getX() < m_el[2].getZ())
-                    { i = 2; j = 0; k = 1; }
-                else
-                    { i = 0; j = 1; k = 2; }
-            }
-
-            x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0);
-
-            temp.f[3] = (m_el[k][j] - m_el[j][k]);
-            temp.f[j] = (m_el[j][i] + m_el[i][j]);
-            temp.f[k] = (m_el[k][i] + m_el[i][k]);
-            temp.f[i] = x;
-            //temp.f[i] = s * b3Scalar(0.5);
-        }
-
-        s = b3Sqrt(x);
-        q.set128(temp.vec);
-        s = b3Scalar(0.5) / s;
-
-        q *= s;
-#else    
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
+		b3Scalar s, x;
+
+		union {
+			b3SimdFloat4 vec;
+			b3Scalar f[4];
+		} temp;
+
+		if (trace > b3Scalar(0.0))
+		{
+			x = trace + b3Scalar(1.0);
+
+			temp.f[0] = m_el[2].getY() - m_el[1].getZ();
+			temp.f[1] = m_el[0].getZ() - m_el[2].getX();
+			temp.f[2] = m_el[1].getX() - m_el[0].getY();
+			temp.f[3] = x;
+			//temp.f[3]= s * b3Scalar(0.5);
+		}
+		else
+		{
+			int i, j, k;
+			if (m_el[0].getX() < m_el[1].getY())
+			{
+				if (m_el[1].getY() < m_el[2].getZ())
+				{
+					i = 2;
+					j = 0;
+					k = 1;
+				}
+				else
+				{
+					i = 1;
+					j = 2;
+					k = 0;
+				}
+			}
+			else
+			{
+				if (m_el[0].getX() < m_el[2].getZ())
+				{
+					i = 2;
+					j = 0;
+					k = 1;
+				}
+				else
+				{
+					i = 0;
+					j = 1;
+					k = 2;
+				}
+			}
+
+			x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0);
+
+			temp.f[3] = (m_el[k][j] - m_el[j][k]);
+			temp.f[j] = (m_el[j][i] + m_el[i][j]);
+			temp.f[k] = (m_el[k][i] + m_el[i][k]);
+			temp.f[i] = x;
+			//temp.f[i] = s * b3Scalar(0.5);
+		}
+
+		s = b3Sqrt(x);
+		q.set128(temp.vec);
+		s = b3Scalar(0.5) / s;
+
+		q *= s;
+#else
 		b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
 
 		b3Scalar temp[4];
 
-		if (trace > b3Scalar(0.0)) 
+		if (trace > b3Scalar(0.0))
 		{
 			b3Scalar s = b3Sqrt(trace + b3Scalar(1.0));
-			temp[3]=(s * b3Scalar(0.5));
+			temp[3] = (s * b3Scalar(0.5));
 			s = b3Scalar(0.5) / s;
 
-			temp[0]=((m_el[2].getY() - m_el[1].getZ()) * s);
-			temp[1]=((m_el[0].getZ() - m_el[2].getX()) * s);
-			temp[2]=((m_el[1].getX() - m_el[0].getY()) * s);
-		} 
-		else 
+			temp[0] = ((m_el[2].getY() - m_el[1].getZ()) * s);
+			temp[1] = ((m_el[0].getZ() - m_el[2].getX()) * s);
+			temp[2] = ((m_el[1].getX() - m_el[0].getY()) * s);
+		}
+		else
 		{
-			int i = m_el[0].getX() < m_el[1].getY() ? 
-				(m_el[1].getY() < m_el[2].getZ() ? 2 : 1) :
-				(m_el[0].getX() < m_el[2].getZ() ? 2 : 0); 
-			int j = (i + 1) % 3;  
+			int i = m_el[0].getX() < m_el[1].getY() ? (m_el[1].getY() < m_el[2].getZ() ? 2 : 1) : (m_el[0].getX() < m_el[2].getZ() ? 2 : 0);
+			int j = (i + 1) % 3;
 			int k = (i + 2) % 3;
 
 			b3Scalar s = b3Sqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0));
@@ -476,44 +487,42 @@ public:
 			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
 			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
 		}
-		q.setValue(temp[0],temp[1],temp[2],temp[3]);
+		q.setValue(temp[0], temp[1], temp[2], temp[3]);
 #endif
 	}
 
 	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
 	* @param yaw Yaw around Y axis
 	* @param pitch Pitch around X axis
-	* @param roll around Z axis */	
-	void getEulerYPR(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll) const
+	* @param roll around Z axis */
+	void getEulerYPR(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll) const
 	{
-
 		// first use the normal calculus
 		yaw = b3Scalar(b3Atan2(m_el[1].getX(), m_el[0].getX()));
 		pitch = b3Scalar(b3Asin(-m_el[2].getX()));
 		roll = b3Scalar(b3Atan2(m_el[2].getY(), m_el[2].getZ()));
 
 		// on pitch = +/-HalfPI
-		if (b3Fabs(pitch)==B3_HALF_PI)
+		if (b3Fabs(pitch) == B3_HALF_PI)
 		{
-			if (yaw>0)
-				yaw-=B3_PI;
+			if (yaw > 0)
+				yaw -= B3_PI;
 			else
-				yaw+=B3_PI;
+				yaw += B3_PI;
 
-			if (roll>0)
-				roll-=B3_PI;
+			if (roll > 0)
+				roll -= B3_PI;
 			else
-				roll+=B3_PI;
+				roll += B3_PI;
 		}
 	};
 
-
 	/**@brief Get the matrix represented as euler angles around ZYX
 	* @param yaw Yaw around X axis
 	* @param pitch Pitch around Y axis
 	* @param roll around X axis 
-	* @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/	
-	void getEulerZYX(b3Scalar& yaw, b3Scalar& pitch, b3Scalar& roll, unsigned int solution_number = 1) const
+	* @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
+	void getEulerZYX(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll, unsigned int solution_number = 1) const
 	{
 		struct Euler
 		{
@@ -523,7 +532,7 @@ public:
 		};
 
 		Euler euler_out;
-		Euler euler_out2; //second solution
+		Euler euler_out2;  //second solution
 		//get the pointer to the raw data
 
 		// Check that pitch is not at a singularity
@@ -533,7 +542,7 @@ public:
 			euler_out2.yaw = 0;
 
 			// From difference of angles formula
-			b3Scalar delta = b3Atan2(m_el[0].getX(),m_el[0].getZ());
+			b3Scalar delta = b3Atan2(m_el[0].getX(), m_el[0].getZ());
 			if (m_el[2].getX() > 0)  //gimbal locked up
 			{
 				euler_out.pitch = B3_PI / b3Scalar(2.0);
@@ -541,7 +550,7 @@ public:
 				euler_out.roll = euler_out.pitch + delta;
 				euler_out2.roll = euler_out.pitch + delta;
 			}
-			else // gimbal locked down
+			else  // gimbal locked down
 			{
 				euler_out.pitch = -B3_PI / b3Scalar(2.0);
 				euler_out2.pitch = -B3_PI / b3Scalar(2.0);
@@ -551,29 +560,29 @@ public:
 		}
 		else
 		{
-			euler_out.pitch = - b3Asin(m_el[2].getX());
+			euler_out.pitch = -b3Asin(m_el[2].getX());
 			euler_out2.pitch = B3_PI - euler_out.pitch;
 
-			euler_out.roll = b3Atan2(m_el[2].getY()/b3Cos(euler_out.pitch), 
-				m_el[2].getZ()/b3Cos(euler_out.pitch));
-			euler_out2.roll = b3Atan2(m_el[2].getY()/b3Cos(euler_out2.pitch), 
-				m_el[2].getZ()/b3Cos(euler_out2.pitch));
+			euler_out.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out.pitch),
+									 m_el[2].getZ() / b3Cos(euler_out.pitch));
+			euler_out2.roll = b3Atan2(m_el[2].getY() / b3Cos(euler_out2.pitch),
+									  m_el[2].getZ() / b3Cos(euler_out2.pitch));
 
-			euler_out.yaw = b3Atan2(m_el[1].getX()/b3Cos(euler_out.pitch), 
-				m_el[0].getX()/b3Cos(euler_out.pitch));
-			euler_out2.yaw = b3Atan2(m_el[1].getX()/b3Cos(euler_out2.pitch), 
-				m_el[0].getX()/b3Cos(euler_out2.pitch));
+			euler_out.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out.pitch),
+									m_el[0].getX() / b3Cos(euler_out.pitch));
+			euler_out2.yaw = b3Atan2(m_el[1].getX() / b3Cos(euler_out2.pitch),
+									 m_el[0].getX() / b3Cos(euler_out2.pitch));
 		}
 
 		if (solution_number == 1)
-		{ 
-			yaw = euler_out.yaw; 
+		{
+			yaw = euler_out.yaw;
 			pitch = euler_out.pitch;
 			roll = euler_out.roll;
 		}
 		else
-		{ 
-			yaw = euler_out2.yaw; 
+		{
+			yaw = euler_out2.yaw;
 			pitch = euler_out2.pitch;
 			roll = euler_out2.roll;
 		}
@@ -584,18 +593,18 @@ public:
 
 	b3Matrix3x3 scaled(const b3Vector3& s) const
 	{
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
 		return b3Matrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
-#else		
+#else
 		return b3Matrix3x3(
-            m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(),
+			m_el[0].getX() * s.getX(), m_el[0].getY() * s.getY(), m_el[0].getZ() * s.getZ(),
 			m_el[1].getX() * s.getX(), m_el[1].getY() * s.getY(), m_el[1].getZ() * s.getZ(),
 			m_el[2].getX() * s.getX(), m_el[2].getY() * s.getY(), m_el[2].getZ() * s.getZ());
 #endif
 	}
 
 	/**@brief Return the determinant of the matrix */
-	b3Scalar            determinant() const;
+	b3Scalar determinant() const;
 	/**@brief Return the adjoint of the matrix */
 	b3Matrix3x3 adjoint() const;
 	/**@brief Return the matrix with all values non negative */
@@ -603,25 +612,24 @@ public:
 	/**@brief Return the transpose of the matrix */
 	b3Matrix3x3 transpose() const;
 	/**@brief Return the inverse of the matrix */
-	b3Matrix3x3 inverse() const; 
+	b3Matrix3x3 inverse() const;
 
 	b3Matrix3x3 transposeTimes(const b3Matrix3x3& m) const;
 	b3Matrix3x3 timesTranspose(const b3Matrix3x3& m) const;
 
-	B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const 
+	B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const
 	{
 		return m_el[0].getX() * v.getX() + m_el[1].getX() * v.getY() + m_el[2].getX() * v.getZ();
 	}
-	B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const 
+	B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const
 	{
 		return m_el[0].getY() * v.getX() + m_el[1].getY() * v.getY() + m_el[2].getY() * v.getZ();
 	}
-	B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const 
+	B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const
 	{
 		return m_el[0].getZ() * v.getX() + m_el[1].getZ() * v.getY() + m_el[2].getZ() * v.getZ();
 	}
 
-
 	/**@brief diagonalizes this matrix by the Jacobi method.
 	* @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
 	* coordinate system, i.e., old_this = rot * new_this * rot^T. 
@@ -631,7 +639,7 @@ public:
 	* 
 	* Note that this matrix is assumed to be symmetric. 
 	*/
-	void diagonalize(b3Matrix3x3& rot, b3Scalar threshold, int maxSteps)
+	void diagonalize(b3Matrix3x3 & rot, b3Scalar threshold, int maxSteps)
 	{
 		rot.setIdentity();
 		for (int step = maxSteps; step > 0; step--)
@@ -667,7 +675,7 @@ public:
 				step = 1;
 			}
 
-			// compute Jacobi rotation J which leads to a zero for element [p][q] 
+			// compute Jacobi rotation J which leads to a zero for element [p][q]
 			b3Scalar mpq = m_el[p][q];
 			b3Scalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
 			b3Scalar theta2 = theta * theta;
@@ -676,7 +684,7 @@ public:
 			if (theta2 * theta2 < b3Scalar(10 / B3_EPSILON))
 			{
 				t = (theta >= 0) ? 1 / (theta + b3Sqrt(1 + theta2))
-					: 1 / (theta - b3Sqrt(1 + theta2));
+								 : 1 / (theta - b3Sqrt(1 + theta2));
 				cos = 1 / b3Sqrt(1 + t * t);
 				sin = cos * t;
 			}
@@ -709,9 +717,6 @@ public:
 		}
 	}
 
-
-
-
 	/**@brief Calculate the matrix cofactor 
 	* @param r1 The first row to use for calculating the cofactor
 	* @param c1 The first column to use for calculating the cofactor
@@ -719,304 +724,298 @@ public:
 	* @param c1 The second column to use for calculating the cofactor
 	* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
 	*/
-	b3Scalar cofac(int r1, int c1, int r2, int c2) const 
+	b3Scalar cofac(int r1, int c1, int r2, int c2) const
 	{
 		return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
 	}
 
-	void	serialize(struct	b3Matrix3x3Data& dataOut) const;
+	void serialize(struct b3Matrix3x3Data & dataOut) const;
 
-	void	serializeFloat(struct	b3Matrix3x3FloatData& dataOut) const;
+	void serializeFloat(struct b3Matrix3x3FloatData & dataOut) const;
 
-	void	deSerialize(const struct	b3Matrix3x3Data& dataIn);
+	void deSerialize(const struct b3Matrix3x3Data& dataIn);
 
-	void	deSerializeFloat(const struct	b3Matrix3x3FloatData& dataIn);
-
-	void	deSerializeDouble(const struct	b3Matrix3x3DoubleData& dataIn);
+	void deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn);
 
+	void deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn);
 };
 
-
-B3_FORCE_INLINE b3Matrix3x3& 
+B3_FORCE_INLINE b3Matrix3x3&
 b3Matrix3x3::operator*=(const b3Matrix3x3& m)
 {
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-    __m128 rv00, rv01, rv02;
-    __m128 rv10, rv11, rv12;
-    __m128 rv20, rv21, rv22;
-    __m128 mv0, mv1, mv2;
-
-    rv02 = m_el[0].mVec128;
-    rv12 = m_el[1].mVec128;
-    rv22 = m_el[2].mVec128;
-
-    mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask); 
-    mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask); 
-    mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask); 
-    
-    // rv0
-    rv00 = b3_splat_ps(rv02, 0);
-    rv01 = b3_splat_ps(rv02, 1);
-    rv02 = b3_splat_ps(rv02, 2);
-    
-    rv00 = _mm_mul_ps(rv00, mv0);
-    rv01 = _mm_mul_ps(rv01, mv1);
-    rv02 = _mm_mul_ps(rv02, mv2);
-    
-    // rv1
-    rv10 = b3_splat_ps(rv12, 0);
-    rv11 = b3_splat_ps(rv12, 1);
-    rv12 = b3_splat_ps(rv12, 2);
-    
-    rv10 = _mm_mul_ps(rv10, mv0);
-    rv11 = _mm_mul_ps(rv11, mv1);
-    rv12 = _mm_mul_ps(rv12, mv2);
-    
-    // rv2
-    rv20 = b3_splat_ps(rv22, 0);
-    rv21 = b3_splat_ps(rv22, 1);
-    rv22 = b3_splat_ps(rv22, 2);
-    
-    rv20 = _mm_mul_ps(rv20, mv0);
-    rv21 = _mm_mul_ps(rv21, mv1);
-    rv22 = _mm_mul_ps(rv22, mv2);
-
-    rv00 = _mm_add_ps(rv00, rv01);
-    rv10 = _mm_add_ps(rv10, rv11);
-    rv20 = _mm_add_ps(rv20, rv21);
-
-    m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
-    m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
-    m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 rv00, rv01, rv02;
+	__m128 rv10, rv11, rv12;
+	__m128 rv20, rv21, rv22;
+	__m128 mv0, mv1, mv2;
+
+	rv02 = m_el[0].mVec128;
+	rv12 = m_el[1].mVec128;
+	rv22 = m_el[2].mVec128;
+
+	mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask);
+	mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask);
+	mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask);
+
+	// rv0
+	rv00 = b3_splat_ps(rv02, 0);
+	rv01 = b3_splat_ps(rv02, 1);
+	rv02 = b3_splat_ps(rv02, 2);
+
+	rv00 = _mm_mul_ps(rv00, mv0);
+	rv01 = _mm_mul_ps(rv01, mv1);
+	rv02 = _mm_mul_ps(rv02, mv2);
+
+	// rv1
+	rv10 = b3_splat_ps(rv12, 0);
+	rv11 = b3_splat_ps(rv12, 1);
+	rv12 = b3_splat_ps(rv12, 2);
+
+	rv10 = _mm_mul_ps(rv10, mv0);
+	rv11 = _mm_mul_ps(rv11, mv1);
+	rv12 = _mm_mul_ps(rv12, mv2);
+
+	// rv2
+	rv20 = b3_splat_ps(rv22, 0);
+	rv21 = b3_splat_ps(rv22, 1);
+	rv22 = b3_splat_ps(rv22, 2);
+
+	rv20 = _mm_mul_ps(rv20, mv0);
+	rv21 = _mm_mul_ps(rv21, mv1);
+	rv22 = _mm_mul_ps(rv22, mv2);
+
+	rv00 = _mm_add_ps(rv00, rv01);
+	rv10 = _mm_add_ps(rv10, rv11);
+	rv20 = _mm_add_ps(rv20, rv21);
+
+	m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
+	m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
+	m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
 
 #elif defined(B3_USE_NEON)
 
-    float32x4_t rv0, rv1, rv2;
-    float32x4_t v0, v1, v2;
-    float32x4_t mv0, mv1, mv2;
-
-    v0 = m_el[0].mVec128;
-    v1 = m_el[1].mVec128;
-    v2 = m_el[2].mVec128;
-
-    mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask); 
-    mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask); 
-    mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask); 
-    
-    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
-    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
-    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
-    
-    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
-    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
-    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
-    
-    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
-    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
-    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
-
-    m_el[0].mVec128 = rv0;
-    m_el[1].mVec128 = rv1;
-    m_el[2].mVec128 = rv2;
-#else    
+	float32x4_t rv0, rv1, rv2;
+	float32x4_t v0, v1, v2;
+	float32x4_t mv0, mv1, mv2;
+
+	v0 = m_el[0].mVec128;
+	v1 = m_el[1].mVec128;
+	v2 = m_el[2].mVec128;
+
+	mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
+	mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
+	mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
+
+	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+
+	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+
+	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+
+	m_el[0].mVec128 = rv0;
+	m_el[1].mVec128 = rv1;
+	m_el[2].mVec128 = rv2;
+#else
 	setValue(
-        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+		m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
 		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
 		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
 #endif
 	return *this;
 }
 
-B3_FORCE_INLINE b3Matrix3x3& 
+B3_FORCE_INLINE b3Matrix3x3&
 b3Matrix3x3::operator+=(const b3Matrix3x3& m)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
-    m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
-    m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
-    m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
+	m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
+	m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
 #else
 	setValue(
-		m_el[0][0]+m.m_el[0][0], 
-		m_el[0][1]+m.m_el[0][1],
-		m_el[0][2]+m.m_el[0][2],
-		m_el[1][0]+m.m_el[1][0], 
-		m_el[1][1]+m.m_el[1][1],
-		m_el[1][2]+m.m_el[1][2],
-		m_el[2][0]+m.m_el[2][0], 
-		m_el[2][1]+m.m_el[2][1],
-		m_el[2][2]+m.m_el[2][2]);
+		m_el[0][0] + m.m_el[0][0],
+		m_el[0][1] + m.m_el[0][1],
+		m_el[0][2] + m.m_el[0][2],
+		m_el[1][0] + m.m_el[1][0],
+		m_el[1][1] + m.m_el[1][1],
+		m_el[1][2] + m.m_el[1][2],
+		m_el[2][0] + m.m_el[2][0],
+		m_el[2][1] + m.m_el[2][1],
+		m_el[2][2] + m.m_el[2][2]);
 #endif
 	return *this;
 }
 
 B3_FORCE_INLINE b3Matrix3x3
-operator*(const b3Matrix3x3& m, const b3Scalar & k)
+operator*(const b3Matrix3x3& m, const b3Scalar& k)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-    __m128 vk = b3_splat_ps(_mm_load_ss((float *)&k), 0x80);
-    return b3Matrix3x3(
-                _mm_mul_ps(m[0].mVec128, vk), 
-                _mm_mul_ps(m[1].mVec128, vk), 
-                _mm_mul_ps(m[2].mVec128, vk)); 
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 vk = b3_splat_ps(_mm_load_ss((float*)&k), 0x80);
+	return b3Matrix3x3(
+		_mm_mul_ps(m[0].mVec128, vk),
+		_mm_mul_ps(m[1].mVec128, vk),
+		_mm_mul_ps(m[2].mVec128, vk));
 #elif defined(B3_USE_NEON)
-    return b3Matrix3x3(
-                vmulq_n_f32(m[0].mVec128, k),
-                vmulq_n_f32(m[1].mVec128, k),
-                vmulq_n_f32(m[2].mVec128, k)); 
+	return b3Matrix3x3(
+		vmulq_n_f32(m[0].mVec128, k),
+		vmulq_n_f32(m[1].mVec128, k),
+		vmulq_n_f32(m[2].mVec128, k));
 #else
 	return b3Matrix3x3(
-		m[0].getX()*k,m[0].getY()*k,m[0].getZ()*k,
-		m[1].getX()*k,m[1].getY()*k,m[1].getZ()*k,
-		m[2].getX()*k,m[2].getY()*k,m[2].getZ()*k);
+		m[0].getX() * k, m[0].getY() * k, m[0].getZ() * k,
+		m[1].getX() * k, m[1].getY() * k, m[1].getZ() * k,
+		m[2].getX() * k, m[2].getY() * k, m[2].getZ() * k);
 #endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 operator+(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
 	return b3Matrix3x3(
-        m1[0].mVec128 + m2[0].mVec128,
-        m1[1].mVec128 + m2[1].mVec128,
-        m1[2].mVec128 + m2[2].mVec128);
+		m1[0].mVec128 + m2[0].mVec128,
+		m1[1].mVec128 + m2[1].mVec128,
+		m1[2].mVec128 + m2[2].mVec128);
 #else
 	return b3Matrix3x3(
-        m1[0][0]+m2[0][0], 
-        m1[0][1]+m2[0][1],
-        m1[0][2]+m2[0][2],
-        
-        m1[1][0]+m2[1][0], 
-        m1[1][1]+m2[1][1],
-        m1[1][2]+m2[1][2],
-        
-        m1[2][0]+m2[2][0], 
-        m1[2][1]+m2[2][1],
-        m1[2][2]+m2[2][2]);
-#endif    
+		m1[0][0] + m2[0][0],
+		m1[0][1] + m2[0][1],
+		m1[0][2] + m2[0][2],
+
+		m1[1][0] + m2[1][0],
+		m1[1][1] + m2[1][1],
+		m1[1][2] + m2[1][2],
+
+		m1[2][0] + m2[2][0],
+		m1[2][1] + m2[2][1],
+		m1[2][2] + m2[2][2]);
+#endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 operator-(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
 	return b3Matrix3x3(
-        m1[0].mVec128 - m2[0].mVec128,
-        m1[1].mVec128 - m2[1].mVec128,
-        m1[2].mVec128 - m2[2].mVec128);
+		m1[0].mVec128 - m2[0].mVec128,
+		m1[1].mVec128 - m2[1].mVec128,
+		m1[2].mVec128 - m2[2].mVec128);
 #else
 	return b3Matrix3x3(
-        m1[0][0]-m2[0][0], 
-        m1[0][1]-m2[0][1],
-        m1[0][2]-m2[0][2],
-        
-        m1[1][0]-m2[1][0], 
-        m1[1][1]-m2[1][1],
-        m1[1][2]-m2[1][2],
-        
-        m1[2][0]-m2[2][0], 
-        m1[2][1]-m2[2][1],
-        m1[2][2]-m2[2][2]);
+		m1[0][0] - m2[0][0],
+		m1[0][1] - m2[0][1],
+		m1[0][2] - m2[0][2],
+
+		m1[1][0] - m2[1][0],
+		m1[1][1] - m2[1][1],
+		m1[1][2] - m2[1][2],
+
+		m1[2][0] - m2[2][0],
+		m1[2][1] - m2[2][1],
+		m1[2][2] - m2[2][2]);
 #endif
 }
 
-
-B3_FORCE_INLINE b3Matrix3x3& 
+B3_FORCE_INLINE b3Matrix3x3&
 b3Matrix3x3::operator-=(const b3Matrix3x3& m)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
-    m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
-    m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
-    m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
+	m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
+	m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
 #else
 	setValue(
-	m_el[0][0]-m.m_el[0][0], 
-	m_el[0][1]-m.m_el[0][1],
-	m_el[0][2]-m.m_el[0][2],
-	m_el[1][0]-m.m_el[1][0], 
-	m_el[1][1]-m.m_el[1][1],
-	m_el[1][2]-m.m_el[1][2],
-	m_el[2][0]-m.m_el[2][0], 
-	m_el[2][1]-m.m_el[2][1],
-	m_el[2][2]-m.m_el[2][2]);
+		m_el[0][0] - m.m_el[0][0],
+		m_el[0][1] - m.m_el[0][1],
+		m_el[0][2] - m.m_el[0][2],
+		m_el[1][0] - m.m_el[1][0],
+		m_el[1][1] - m.m_el[1][1],
+		m_el[1][2] - m.m_el[1][2],
+		m_el[2][0] - m.m_el[2][0],
+		m_el[2][1] - m.m_el[2][1],
+		m_el[2][2] - m.m_el[2][2]);
 #endif
 	return *this;
 }
 
-
-B3_FORCE_INLINE b3Scalar 
+B3_FORCE_INLINE b3Scalar
 b3Matrix3x3::determinant() const
-{ 
+{
 	return b3Triple((*this)[0], (*this)[1], (*this)[2]);
 }
 
-
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 b3Matrix3x3::absolute() const
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-    return b3Matrix3x3(
-            _mm_and_ps(m_el[0].mVec128, b3vAbsfMask),
-            _mm_and_ps(m_el[1].mVec128, b3vAbsfMask),
-            _mm_and_ps(m_el[2].mVec128, b3vAbsfMask));
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	return b3Matrix3x3(
+		_mm_and_ps(m_el[0].mVec128, b3vAbsfMask),
+		_mm_and_ps(m_el[1].mVec128, b3vAbsfMask),
+		_mm_and_ps(m_el[2].mVec128, b3vAbsfMask));
 #elif defined(B3_USE_NEON)
-    return b3Matrix3x3(
-            (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask),
-            (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask),
-            (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask));
-#else	
 	return b3Matrix3x3(
-            b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()),
-            b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()),
-            b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ()));
+		(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask),
+		(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask),
+		(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask));
+#else
+	return b3Matrix3x3(
+		b3Fabs(m_el[0].getX()), b3Fabs(m_el[0].getY()), b3Fabs(m_el[0].getZ()),
+		b3Fabs(m_el[1].getX()), b3Fabs(m_el[1].getY()), b3Fabs(m_el[1].getZ()),
+		b3Fabs(m_el[2].getX()), b3Fabs(m_el[2].getY()), b3Fabs(m_el[2].getZ()));
 #endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
-b3Matrix3x3::transpose() const 
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::transpose() const
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-    __m128 v0 = m_el[0].mVec128;
-    __m128 v1 = m_el[1].mVec128;
-    __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
-    __m128 vT;
-    
-    v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
-    
-    vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
-    v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
-
-    v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
-    v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
-    v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));	// z0 z1 z2 0
-
-
-    return b3Matrix3x3( v0, v1, v2 );
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 v0 = m_el[0].mVec128;
+	__m128 v1 = m_el[1].mVec128;
+	__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
+	__m128 vT;
+
+	v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+
+	vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
+	v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1
+
+	v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
+	v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
+	v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));  // z0 z1 z2 0
+
+	return b3Matrix3x3(v0, v1, v2);
 #elif defined(B3_USE_NEON)
-    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
-    static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
-    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
-    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
-    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
-    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
-    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
-    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
-    return b3Matrix3x3( v0, v1, v2 ); 
+	// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+	static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
+	float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+	float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
+	float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
+	float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
+	float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
+	float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
+	return b3Matrix3x3(v0, v1, v2);
 #else
-	return b3Matrix3x3( m_el[0].getX(), m_el[1].getX(), m_el[2].getX(),
-                        m_el[0].getY(), m_el[1].getY(), m_el[2].getY(),
-                        m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ());
+	return b3Matrix3x3(m_el[0].getX(), m_el[1].getX(), m_el[2].getX(),
+					   m_el[0].getY(), m_el[1].getY(), m_el[2].getY(),
+					   m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ());
 #endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
-b3Matrix3x3::adjoint() const 
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::adjoint() const
 {
 	return b3Matrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
-		cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
-		cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
+					   cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
+					   cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 b3Matrix3x3::inverse() const
 {
 	b3Vector3 co = b3MakeVector3(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
@@ -1024,54 +1023,54 @@ b3Matrix3x3::inverse() const
 	b3FullAssert(det != b3Scalar(0.0));
 	b3Scalar s = b3Scalar(1.0) / det;
 	return b3Matrix3x3(co.getX() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
-		co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
-		co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
+					   co.getY() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
+					   co.getZ() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-    // zeros w
-//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
-    __m128 row = m_el[0].mVec128;
-    __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, b3vFFF0fMask );
-    __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, b3vFFF0fMask);
-    __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, b3vFFF0fMask );
-    __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
-    __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
-    __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
-    row = m_el[1].mVec128;
-    r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
-    r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
-    r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
-    row = m_el[2].mVec128;
-    r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
-    r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
-    r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
-    return b3Matrix3x3( r0, r1, r2 );
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	// zeros w
+	//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
+	__m128 row = m_el[0].mVec128;
+	__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, b3vFFF0fMask);
+	__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, b3vFFF0fMask);
+	__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, b3vFFF0fMask);
+	__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
+	__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
+	__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
+	row = m_el[1].mVec128;
+	r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
+	row = m_el[2].mVec128;
+	r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
+	return b3Matrix3x3(r0, r1, r2);
 
 #elif defined B3_USE_NEON
-    // zeros w
-    static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
-    float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
-    float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
-    float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
-    float32x4_t row = m_el[0].mVec128;
-    float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
-    float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
-    float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
-    row = m_el[1].mVec128;
-    r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
-    r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
-    r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
-    row = m_el[2].mVec128;
-    r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
-    r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
-    r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
-    return b3Matrix3x3( r0, r1, r2 );
+	// zeros w
+	static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
+	float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
+	float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
+	float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
+	float32x4_t row = m_el[0].mVec128;
+	float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
+	float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
+	float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
+	row = m_el[1].mVec128;
+	r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
+	r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
+	r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
+	row = m_el[2].mVec128;
+	r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
+	r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
+	r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
+	return b3Matrix3x3(r0, r1, r2);
 #else
-    return b3Matrix3x3(
+	return b3Matrix3x3(
 		m_el[0].getX() * m[0].getX() + m_el[1].getX() * m[1].getX() + m_el[2].getX() * m[2].getX(),
 		m_el[0].getX() * m[0].getY() + m_el[1].getX() * m[1].getY() + m_el[2].getX() * m[2].getY(),
 		m_el[0].getX() * m[0].getZ() + m_el[1].getX() * m[1].getZ() + m_el[2].getX() * m[2].getZ(),
@@ -1084,51 +1083,51 @@ b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const
 #endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-    __m128 a0 = m_el[0].mVec128;
-    __m128 a1 = m_el[1].mVec128;
-    __m128 a2 = m_el[2].mVec128;
-    
-    b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
-    __m128 mx = mT[0].mVec128;
-    __m128 my = mT[1].mVec128;
-    __m128 mz = mT[2].mVec128;
-    
-    __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
-    __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
-    __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
-    r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
-    r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
-    r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
-    r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
-    r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
-    r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
-    return b3Matrix3x3( r0, r1, r2);
-            
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 a0 = m_el[0].mVec128;
+	__m128 a1 = m_el[1].mVec128;
+	__m128 a2 = m_el[2].mVec128;
+
+	b3Matrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
+	__m128 mx = mT[0].mVec128;
+	__m128 my = mT[1].mVec128;
+	__m128 mz = mT[2].mVec128;
+
+	__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
+	__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
+	__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
+	r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
+	r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
+	return b3Matrix3x3(r0, r1, r2);
+
 #elif defined B3_USE_NEON
-    float32x4_t a0 = m_el[0].mVec128;
-    float32x4_t a1 = m_el[1].mVec128;
-    float32x4_t a2 = m_el[2].mVec128;
-    
-    b3Matrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
-    float32x4_t mx = mT[0].mVec128;
-    float32x4_t my = mT[1].mVec128;
-    float32x4_t mz = mT[2].mVec128;
-    
-    float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
-    float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
-    float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
-    r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
-    r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
-    r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
-    r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
-    r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
-    r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
-    return b3Matrix3x3( r0, r1, r2 );
-    
+	float32x4_t a0 = m_el[0].mVec128;
+	float32x4_t a1 = m_el[1].mVec128;
+	float32x4_t a2 = m_el[2].mVec128;
+
+	b3Matrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
+	float32x4_t mx = mT[0].mVec128;
+	float32x4_t my = mT[1].mVec128;
+	float32x4_t mz = mT[2].mVec128;
+
+	float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
+	float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
+	float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
+	r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
+	r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
+	r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
+	r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
+	r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
+	r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
+	return b3Matrix3x3(r0, r1, r2);
+
 #else
 	return b3Matrix3x3(
 		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
@@ -1137,139 +1136,138 @@ b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const
 #endif
 }
 
-B3_FORCE_INLINE b3Vector3 
-operator*(const b3Matrix3x3& m, const b3Vector3& v) 
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Matrix3x3& m, const b3Vector3& v)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))|| defined (B3_USE_NEON)
-    return v.dot3(m[0], m[1], m[2]);
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	return v.dot3(m[0], m[1], m[2]);
 #else
 	return b3MakeVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
 #endif
 }
 
-
 B3_FORCE_INLINE b3Vector3
 operator*(const b3Vector3& v, const b3Matrix3x3& m)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 
-    const __m128 vv = v.mVec128;
+	const __m128 vv = v.mVec128;
 
-    __m128 c0 = b3_splat_ps( vv, 0);
-    __m128 c1 = b3_splat_ps( vv, 1);
-    __m128 c2 = b3_splat_ps( vv, 2);
+	__m128 c0 = b3_splat_ps(vv, 0);
+	__m128 c1 = b3_splat_ps(vv, 1);
+	__m128 c2 = b3_splat_ps(vv, 2);
 
-    c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask) );
-    c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask) );
-    c0 = _mm_add_ps(c0, c1);
-    c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask) );
-    
-    return b3MakeVector3(_mm_add_ps(c0, c2));
+	c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask));
+	c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask));
+	c0 = _mm_add_ps(c0, c1);
+	c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask));
+
+	return b3MakeVector3(_mm_add_ps(c0, c2));
 #elif defined(B3_USE_NEON)
-    const float32x4_t vv = v.mVec128;
-    const float32x2_t vlo = vget_low_f32(vv);
-    const float32x2_t vhi = vget_high_f32(vv);
-
-    float32x4_t c0, c1, c2;
-
-    c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
-    c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
-    c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
-
-    c0 = vmulq_lane_f32(c0, vlo, 0);
-    c1 = vmulq_lane_f32(c1, vlo, 1);
-    c2 = vmulq_lane_f32(c2, vhi, 0);
-    c0 = vaddq_f32(c0, c1);
-    c0 = vaddq_f32(c0, c2);
-    
-    return b3MakeVector3(c0);
+	const float32x4_t vv = v.mVec128;
+	const float32x2_t vlo = vget_low_f32(vv);
+	const float32x2_t vhi = vget_high_f32(vv);
+
+	float32x4_t c0, c1, c2;
+
+	c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
+	c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
+	c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
+
+	c0 = vmulq_lane_f32(c0, vlo, 0);
+	c1 = vmulq_lane_f32(c1, vlo, 1);
+	c2 = vmulq_lane_f32(c2, vhi, 0);
+	c0 = vaddq_f32(c0, c1);
+	c0 = vaddq_f32(c0, c2);
+
+	return b3MakeVector3(c0);
 #else
 	return b3MakeVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
 #endif
 }
 
-B3_FORCE_INLINE b3Matrix3x3 
+B3_FORCE_INLINE b3Matrix3x3
 operator*(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-
-    __m128 m10 = m1[0].mVec128;  
-    __m128 m11 = m1[1].mVec128;
-    __m128 m12 = m1[2].mVec128;
-    
-    __m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask);
-    
-    __m128 c0 = b3_splat_ps( m10, 0);
-    __m128 c1 = b3_splat_ps( m11, 0);
-    __m128 c2 = b3_splat_ps( m12, 0);
-    
-    c0 = _mm_mul_ps(c0, m2v);
-    c1 = _mm_mul_ps(c1, m2v);
-    c2 = _mm_mul_ps(c2, m2v);
-    
-    m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask);
-    
-    __m128 c0_1 = b3_splat_ps( m10, 1);
-    __m128 c1_1 = b3_splat_ps( m11, 1);
-    __m128 c2_1 = b3_splat_ps( m12, 1);
-    
-    c0_1 = _mm_mul_ps(c0_1, m2v);
-    c1_1 = _mm_mul_ps(c1_1, m2v);
-    c2_1 = _mm_mul_ps(c2_1, m2v);
-    
-    m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask);
-    
-    c0 = _mm_add_ps(c0, c0_1);
-    c1 = _mm_add_ps(c1, c1_1);
-    c2 = _mm_add_ps(c2, c2_1);
-    
-    m10 = b3_splat_ps( m10, 2);
-    m11 = b3_splat_ps( m11, 2);
-    m12 = b3_splat_ps( m12, 2);
-    
-    m10 = _mm_mul_ps(m10, m2v);
-    m11 = _mm_mul_ps(m11, m2v);
-    m12 = _mm_mul_ps(m12, m2v);
-    
-    c0 = _mm_add_ps(c0, m10);
-    c1 = _mm_add_ps(c1, m11);
-    c2 = _mm_add_ps(c2, m12);
-    
-    return b3Matrix3x3(c0, c1, c2);
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	__m128 m10 = m1[0].mVec128;
+	__m128 m11 = m1[1].mVec128;
+	__m128 m12 = m1[2].mVec128;
+
+	__m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask);
+
+	__m128 c0 = b3_splat_ps(m10, 0);
+	__m128 c1 = b3_splat_ps(m11, 0);
+	__m128 c2 = b3_splat_ps(m12, 0);
+
+	c0 = _mm_mul_ps(c0, m2v);
+	c1 = _mm_mul_ps(c1, m2v);
+	c2 = _mm_mul_ps(c2, m2v);
+
+	m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask);
+
+	__m128 c0_1 = b3_splat_ps(m10, 1);
+	__m128 c1_1 = b3_splat_ps(m11, 1);
+	__m128 c2_1 = b3_splat_ps(m12, 1);
+
+	c0_1 = _mm_mul_ps(c0_1, m2v);
+	c1_1 = _mm_mul_ps(c1_1, m2v);
+	c2_1 = _mm_mul_ps(c2_1, m2v);
+
+	m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask);
+
+	c0 = _mm_add_ps(c0, c0_1);
+	c1 = _mm_add_ps(c1, c1_1);
+	c2 = _mm_add_ps(c2, c2_1);
+
+	m10 = b3_splat_ps(m10, 2);
+	m11 = b3_splat_ps(m11, 2);
+	m12 = b3_splat_ps(m12, 2);
+
+	m10 = _mm_mul_ps(m10, m2v);
+	m11 = _mm_mul_ps(m11, m2v);
+	m12 = _mm_mul_ps(m12, m2v);
+
+	c0 = _mm_add_ps(c0, m10);
+	c1 = _mm_add_ps(c1, m11);
+	c2 = _mm_add_ps(c2, m12);
+
+	return b3Matrix3x3(c0, c1, c2);
 
 #elif defined(B3_USE_NEON)
 
-    float32x4_t rv0, rv1, rv2;
-    float32x4_t v0, v1, v2;
-    float32x4_t mv0, mv1, mv2;
-
-    v0 = m1[0].mVec128;
-    v1 = m1[1].mVec128;
-    v2 = m1[2].mVec128;
-
-    mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask); 
-    mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask); 
-    mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask); 
-    
-    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
-    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
-    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
-    
-    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
-    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
-    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
-    
-    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
-    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
-    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+	float32x4_t rv0, rv1, rv2;
+	float32x4_t v0, v1, v2;
+	float32x4_t mv0, mv1, mv2;
+
+	v0 = m1[0].mVec128;
+	v1 = m1[1].mVec128;
+	v2 = m1[2].mVec128;
+
+	mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask);
+	mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask);
+	mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask);
+
+	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+
+	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+
+	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
 
 	return b3Matrix3x3(rv0, rv1, rv2);
-        
-#else	
+
+#else
 	return b3Matrix3x3(
-		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
-		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
-		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
+		m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
+		m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
+		m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
 #endif
 }
 
@@ -1292,71 +1290,65 @@ m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
 * It will test all elements are equal.  */
 B3_FORCE_INLINE bool operator==(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
 {
-#if (defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
-
-    __m128 c0, c1, c2;
-
-    c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
-    c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
-    c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
-    
-    c0 = _mm_and_ps(c0, c1);
-    c0 = _mm_and_ps(c0, c2);
-
-    return (0x7 == _mm_movemask_ps((__m128)c0));
-#else 
-	return 
-    (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
-		m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
-		m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	__m128 c0, c1, c2;
+
+	c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
+	c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
+	c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
+
+	c0 = _mm_and_ps(c0, c1);
+	c0 = _mm_and_ps(c0, c2);
+
+	return (0x7 == _mm_movemask_ps((__m128)c0));
+#else
+	return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
+			m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
+			m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
 #endif
 }
 
 ///for serialization
-struct	b3Matrix3x3FloatData
+struct b3Matrix3x3FloatData
 {
 	b3Vector3FloatData m_el[3];
 };
 
 ///for serialization
-struct	b3Matrix3x3DoubleData
+struct b3Matrix3x3DoubleData
 {
 	b3Vector3DoubleData m_el[3];
 };
 
-
-	
-
-B3_FORCE_INLINE	void	b3Matrix3x3::serialize(struct	b3Matrix3x3Data& dataOut) const
+B3_FORCE_INLINE void b3Matrix3x3::serialize(struct b3Matrix3x3Data& dataOut) const
 {
-	for (int i=0;i<3;i++)
+	for (int i = 0; i < 3; i++)
 		m_el[i].serialize(dataOut.m_el[i]);
 }
 
-B3_FORCE_INLINE	void	b3Matrix3x3::serializeFloat(struct	b3Matrix3x3FloatData& dataOut) const
+B3_FORCE_INLINE void b3Matrix3x3::serializeFloat(struct b3Matrix3x3FloatData& dataOut) const
 {
-	for (int i=0;i<3;i++)
+	for (int i = 0; i < 3; i++)
 		m_el[i].serializeFloat(dataOut.m_el[i]);
 }
 
-
-B3_FORCE_INLINE	void	b3Matrix3x3::deSerialize(const struct	b3Matrix3x3Data& dataIn)
+B3_FORCE_INLINE void b3Matrix3x3::deSerialize(const struct b3Matrix3x3Data& dataIn)
 {
-	for (int i=0;i<3;i++)
+	for (int i = 0; i < 3; i++)
 		m_el[i].deSerialize(dataIn.m_el[i]);
 }
 
-B3_FORCE_INLINE	void	b3Matrix3x3::deSerializeFloat(const struct	b3Matrix3x3FloatData& dataIn)
+B3_FORCE_INLINE void b3Matrix3x3::deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn)
 {
-	for (int i=0;i<3;i++)
+	for (int i = 0; i < 3; i++)
 		m_el[i].deSerializeFloat(dataIn.m_el[i]);
 }
 
-B3_FORCE_INLINE	void	b3Matrix3x3::deSerializeDouble(const struct	b3Matrix3x3DoubleData& dataIn)
+B3_FORCE_INLINE void b3Matrix3x3::deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn)
 {
-	for (int i=0;i<3;i++)
+	for (int i = 0; i < 3; i++)
 		m_el[i].deSerializeDouble(dataIn.m_el[i]);
 }
 
-#endif //B3_MATRIX3x3_H
-
+#endif  //B3_MATRIX3x3_H
diff --git a/thirdparty/bullet/Bullet3Common/b3MinMax.h b/thirdparty/bullet/Bullet3Common/b3MinMax.h
index 73af23a4f9..c09c3db3f5 100644
--- a/thirdparty/bullet/Bullet3Common/b3MinMax.h
+++ b/thirdparty/bullet/Bullet3Common/b3MinMax.h
@@ -12,60 +12,58 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-
 #ifndef B3_GEN_MINMAX_H
 #define B3_GEN_MINMAX_H
 
 #include "b3Scalar.h"
 
 template <class T>
-B3_FORCE_INLINE const T& b3Min(const T& a, const T& b) 
+B3_FORCE_INLINE const T& b3Min(const T& a, const T& b)
 {
-  return a < b ? a : b ;
+	return a < b ? a : b;
 }
 
 template <class T>
-B3_FORCE_INLINE const T& b3Max(const T& a, const T& b) 
+B3_FORCE_INLINE const T& b3Max(const T& a, const T& b)
 {
-  return  a > b ? a : b;
+	return a > b ? a : b;
 }
 
 template <class T>
-B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub) 
+B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub)
 {
-	return a < lb ? lb : (ub < a ? ub : a); 
+	return a < lb ? lb : (ub < a ? ub : a);
 }
 
 template <class T>
-B3_FORCE_INLINE void b3SetMin(T& a, const T& b) 
+B3_FORCE_INLINE void b3SetMin(T& a, const T& b)
 {
-    if (b < a) 
+	if (b < a)
 	{
 		a = b;
 	}
 }
 
 template <class T>
-B3_FORCE_INLINE void b3SetMax(T& a, const T& b) 
+B3_FORCE_INLINE void b3SetMax(T& a, const T& b)
 {
-    if (a < b) 
+	if (a < b)
 	{
 		a = b;
 	}
 }
 
 template <class T>
-B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub) 
+B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub)
 {
-	if (a < lb) 
+	if (a < lb)
 	{
-		a = lb; 
+		a = lb;
 	}
-	else if (ub < a) 
+	else if (ub < a)
 	{
 		a = ub;
 	}
 }
 
-#endif //B3_GEN_MINMAX_H
+#endif  //B3_GEN_MINMAX_H
diff --git a/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h b/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h
index 2fcdcf5b24..ed56bc627d 100644
--- a/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h
+++ b/thirdparty/bullet/Bullet3Common/b3PoolAllocator.h
@@ -12,7 +12,6 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef _BT_POOL_ALLOCATOR_H
 #define _BT_POOL_ALLOCATOR_H
 
@@ -22,37 +21,37 @@ subject to the following restrictions:
 ///The b3PoolAllocator class allows to efficiently allocate a large pool of objects, instead of dynamically allocating them separately.
 class b3PoolAllocator
 {
-	int				m_elemSize;
-	int				m_maxElements;
-	int				m_freeCount;
-	void*			m_firstFree;
-	unsigned char*	m_pool;
+	int m_elemSize;
+	int m_maxElements;
+	int m_freeCount;
+	void* m_firstFree;
+	unsigned char* m_pool;
 
 public:
-
 	b3PoolAllocator(int elemSize, int maxElements)
-		:m_elemSize(elemSize),
-		m_maxElements(maxElements)
+		: m_elemSize(elemSize),
+		  m_maxElements(maxElements)
 	{
-		m_pool = (unsigned char*) b3AlignedAlloc( static_cast<unsigned int>(m_elemSize*m_maxElements),16);
+		m_pool = (unsigned char*)b3AlignedAlloc(static_cast<unsigned int>(m_elemSize * m_maxElements), 16);
 
 		unsigned char* p = m_pool;
-        m_firstFree = p;
-        m_freeCount = m_maxElements;
-        int count = m_maxElements;
-        while (--count) {
-            *(void**)p = (p + m_elemSize);
-            p += m_elemSize;
-        }
-        *(void**)p = 0;
-    }
+		m_firstFree = p;
+		m_freeCount = m_maxElements;
+		int count = m_maxElements;
+		while (--count)
+		{
+			*(void**)p = (p + m_elemSize);
+			p += m_elemSize;
+		}
+		*(void**)p = 0;
+	}
 
 	~b3PoolAllocator()
 	{
-		b3AlignedFree( m_pool);
+		b3AlignedFree(m_pool);
 	}
 
-	int	getFreeCount() const
+	int getFreeCount() const
 	{
 		return m_freeCount;
 	}
@@ -67,21 +66,22 @@ public:
 		return m_maxElements;
 	}
 
-	void*	allocate(int size)
+	void* allocate(int size)
 	{
 		// release mode fix
 		(void)size;
-		b3Assert(!size || size<=m_elemSize);
-		b3Assert(m_freeCount>0);
-        void* result = m_firstFree;
-        m_firstFree = *(void**)m_firstFree;
-        --m_freeCount;
-        return result;
+		b3Assert(!size || size <= m_elemSize);
+		b3Assert(m_freeCount > 0);
+		void* result = m_firstFree;
+		m_firstFree = *(void**)m_firstFree;
+		--m_freeCount;
+		return result;
 	}
 
 	bool validPtr(void* ptr)
 	{
-		if (ptr) {
+		if (ptr)
+		{
 			if (((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize))
 			{
 				return true;
@@ -90,32 +90,32 @@ public:
 		return false;
 	}
 
-	void	freeMemory(void* ptr)
+	void freeMemory(void* ptr)
 	{
-		 if (ptr) {
-            b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize);
+		if (ptr)
+		{
+			b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize);
 
-            *(void**)ptr = m_firstFree;
-            m_firstFree = ptr;
-            ++m_freeCount;
-        }
+			*(void**)ptr = m_firstFree;
+			m_firstFree = ptr;
+			++m_freeCount;
+		}
 	}
 
-	int	getElementSize() const
+	int getElementSize() const
 	{
 		return m_elemSize;
 	}
 
-	unsigned char*	getPoolAddress()
+	unsigned char* getPoolAddress()
 	{
 		return m_pool;
 	}
 
-	const unsigned char*	getPoolAddress() const
+	const unsigned char* getPoolAddress() const
 	{
 		return m_pool;
 	}
-
 };
 
-#endif //_BT_POOL_ALLOCATOR_H
+#endif  //_BT_POOL_ALLOCATOR_H
diff --git a/thirdparty/bullet/Bullet3Common/b3QuadWord.h b/thirdparty/bullet/Bullet3Common/b3QuadWord.h
index 65c9581977..0def305fac 100644
--- a/thirdparty/bullet/Bullet3Common/b3QuadWord.h
+++ b/thirdparty/bullet/Bullet3Common/b3QuadWord.h
@@ -12,18 +12,13 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef B3_SIMD_QUADWORD_H
 #define B3_SIMD_QUADWORD_H
 
 #include "b3Scalar.h"
 #include "b3MinMax.h"
 
-
-
-
-
-#if defined (__CELLOS_LV2) && defined (__SPU__)
+#if defined(__CELLOS_LV2) && defined(__SPU__)
 #include <altivec.h>
 #endif
 
@@ -31,58 +26,64 @@ subject to the following restrictions:
  * Some issues under PS3 Linux with IBM 2.1 SDK, gcc compiler prevent from using aligned quadword.
  */
 #ifndef USE_LIBSPE2
-B3_ATTRIBUTE_ALIGNED16(class) b3QuadWord
+B3_ATTRIBUTE_ALIGNED16(class)
+b3QuadWord
 #else
 class b3QuadWord
 #endif
 {
 protected:
-
-#if defined (__SPU__) && defined (__CELLOS_LV2__)
+#if defined(__SPU__) && defined(__CELLOS_LV2__)
 	union {
 		vec_float4 mVec128;
-		b3Scalar	m_floats[4];
+		b3Scalar m_floats[4];
 	};
+
 public:
-	vec_float4	get128() const
+	vec_float4 get128() const
 	{
 		return mVec128;
 	}
 
-#else //__CELLOS_LV2__ __SPU__
+#else  //__CELLOS_LV2__ __SPU__
 
-#if defined(B3_USE_SSE) || defined(B3_USE_NEON) 
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 public:
 	union {
 		b3SimdFloat4 mVec128;
-		b3Scalar	m_floats[4];
-		struct {b3Scalar x,y,z,w;};
+		b3Scalar m_floats[4];
+		struct
+		{
+			b3Scalar x, y, z, w;
+		};
 	};
+
 public:
-	B3_FORCE_INLINE	b3SimdFloat4	get128() const
+	B3_FORCE_INLINE b3SimdFloat4 get128() const
 	{
 		return mVec128;
 	}
-	B3_FORCE_INLINE	void	set128(b3SimdFloat4 v128)
+	B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
 	{
 		mVec128 = v128;
 	}
 #else
 public:
-	union
-	{
-		b3Scalar	m_floats[4];
-		struct {b3Scalar x,y,z,w;};
+	union {
+		b3Scalar m_floats[4];
+		struct
+		{
+			b3Scalar x, y, z, w;
+		};
 	};
-#endif // B3_USE_SSE
+#endif  // B3_USE_SSE || B3_USE_NEON
 
-#endif //__CELLOS_LV2__ __SPU__
+#endif  //__CELLOS_LV2__ __SPU__
 
-	public:
-  
+public:
 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 
-	// Set Vector 
+	// Set Vector
 	B3_FORCE_INLINE b3QuadWord(const b3SimdFloat4 vec)
 	{
 		mVec128 = vec;
@@ -95,151 +96,147 @@ public:
 	}
 
 	// Assignment Operator
-	B3_FORCE_INLINE b3QuadWord& 
-	operator=(const b3QuadWord& v) 
+	B3_FORCE_INLINE b3QuadWord&
+	operator=(const b3QuadWord& v)
 	{
 		mVec128 = v.mVec128;
-		
+
 		return *this;
 	}
-	
+
 #endif
 
-  /**@brief Return the x value */
-		B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
-  /**@brief Return the y value */
-		B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
-  /**@brief Return the z value */
-		B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
-  /**@brief Set the x value */
-		B3_FORCE_INLINE void	setX(b3Scalar _x) { m_floats[0] = _x;};
-  /**@brief Set the y value */
-		B3_FORCE_INLINE void	setY(b3Scalar _y) { m_floats[1] = _y;};
-  /**@brief Set the z value */
-		B3_FORCE_INLINE void	setZ(b3Scalar _z) { m_floats[2] = _z;};
-  /**@brief Set the w value */
-		B3_FORCE_INLINE void	setW(b3Scalar _w) { m_floats[3] = _w;};
-  /**@brief Return the x value */
-
-
-	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}      
+	/**@brief Return the x value */
+	B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
+	/**@brief Return the y value */
+	B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
+	/**@brief Return the z value */
+	B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
+	/**@brief Set the x value */
+	B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; };
+	/**@brief Set the y value */
+	B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; };
+	/**@brief Set the z value */
+	B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; };
+	/**@brief Set the w value */
+	B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; };
+	/**@brief Implicit conversion to a pointer to the first of the four stored scalars */
+
+	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
 	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
 	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
-	B3_FORCE_INLINE	operator       b3Scalar *()       { return &m_floats[0]; }
-	B3_FORCE_INLINE	operator const b3Scalar *() const { return &m_floats[0]; }
+	B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
+	B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
 
-	B3_FORCE_INLINE	bool	operator==(const b3QuadWord& other) const
+	B3_FORCE_INLINE bool operator==(const b3QuadWord& other) const
 	{
 #ifdef B3_USE_SSE
-        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
-#else 
-		return ((m_floats[3]==other.m_floats[3]) && 
-                (m_floats[2]==other.m_floats[2]) && 
-                (m_floats[1]==other.m_floats[1]) && 
-                (m_floats[0]==other.m_floats[0]));
+		return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#else
+		return ((m_floats[3] == other.m_floats[3]) &&
+				(m_floats[2] == other.m_floats[2]) &&
+				(m_floats[1] == other.m_floats[1]) &&
+				(m_floats[0] == other.m_floats[0]));
 #endif
 	}
 
-	B3_FORCE_INLINE	bool	operator!=(const b3QuadWord& other) const
+	B3_FORCE_INLINE bool operator!=(const b3QuadWord& other) const
 	{
 		return !(*this == other);
 	}
 
-  /**@brief Set x,y,z and zero w 
+	/**@brief Set x,y,z and zero w 
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    */
-		B3_FORCE_INLINE void 	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
-		{
-			m_floats[0]=_x;
-			m_floats[1]=_y;
-			m_floats[2]=_z;
-			m_floats[3] = 0.f;
-		}
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	{
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = 0.f;
+	}
 
-/*		void getValue(b3Scalar *m) const 
+	/*		void getValue(b3Scalar *m) const 
 		{
 			m[0] = m_floats[0];
 			m[1] = m_floats[1];
 			m[2] = m_floats[2];
 		}
 */
-/**@brief Set the values 
+	/**@brief Set the values 
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    * @param w Value of w
    */
-		B3_FORCE_INLINE void	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w)
-		{
-			m_floats[0]=_x;
-			m_floats[1]=_y;
-			m_floats[2]=_z;
-			m_floats[3]=_w;
-		}
-  /**@brief No initialization constructor */
-		B3_FORCE_INLINE b3QuadWord()
-		//	:m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.))
-		{
-		}
- 
-  /**@brief Three argument constructor (zeros w)
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = _w;
+	}
+	/**@brief No initialization constructor */
+	B3_FORCE_INLINE b3QuadWord()
+	//	:m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.))
+	{
+	}
+
+	/**@brief Three argument constructor (zeros w)
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    */
-		B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)		
-		{
-			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f;
-		}
+	B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	{
+		m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f;
+	}
 
-/**@brief Initializing constructor
+	/**@brief Initializing constructor
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    * @param w Value of w
    */
-		B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w) 
-		{
-			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w;
-		}
+	B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w;
+	}
 
-  /**@brief Set each element to the max of the current values and the values of another b3QuadWord
+	/**@brief Set each element to the max of the current values and the values of another b3QuadWord
    * @param other The other b3QuadWord to compare with 
    */
-		B3_FORCE_INLINE void	setMax(const b3QuadWord& other)
-		{
-        #ifdef B3_USE_SSE
-            mVec128 = _mm_max_ps(mVec128, other.mVec128);
-        #elif defined(B3_USE_NEON)
-            mVec128 = vmaxq_f32(mVec128, other.mVec128);
-        #else
-        	b3SetMax(m_floats[0], other.m_floats[0]);
-			b3SetMax(m_floats[1], other.m_floats[1]);
-			b3SetMax(m_floats[2], other.m_floats[2]);
-			b3SetMax(m_floats[3], other.m_floats[3]);
-		#endif
-        }
-  /**@brief Set each element to the min of the current values and the values of another b3QuadWord
+	B3_FORCE_INLINE void setMax(const b3QuadWord& other)
+	{
+#ifdef B3_USE_SSE
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+#else
+		b3SetMax(m_floats[0], other.m_floats[0]);
+		b3SetMax(m_floats[1], other.m_floats[1]);
+		b3SetMax(m_floats[2], other.m_floats[2]);
+		b3SetMax(m_floats[3], other.m_floats[3]);
+#endif
+	}
+	/**@brief Set each element to the min of the current values and the values of another b3QuadWord
    * @param other The other b3QuadWord to compare with 
    */
-		B3_FORCE_INLINE void	setMin(const b3QuadWord& other)
-		{
-        #ifdef B3_USE_SSE
-            mVec128 = _mm_min_ps(mVec128, other.mVec128);
-        #elif defined(B3_USE_NEON)
-            mVec128 = vminq_f32(mVec128, other.mVec128);
-        #else
-        	b3SetMin(m_floats[0], other.m_floats[0]);
-			b3SetMin(m_floats[1], other.m_floats[1]);
-			b3SetMin(m_floats[2], other.m_floats[2]);
-			b3SetMin(m_floats[3], other.m_floats[3]);
-		#endif
-        }
-
-
-
+	B3_FORCE_INLINE void setMin(const b3QuadWord& other)
+	{
+#ifdef B3_USE_SSE
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+#else
+		b3SetMin(m_floats[0], other.m_floats[0]);
+		b3SetMin(m_floats[1], other.m_floats[1]);
+		b3SetMin(m_floats[2], other.m_floats[2]);
+		b3SetMin(m_floats[3], other.m_floats[3]);
+#endif
+	}
 };
 
-#endif //B3_SIMD_QUADWORD_H
+#endif  //B3_SIMD_QUADWORD_H
diff --git a/thirdparty/bullet/Bullet3Common/b3Quaternion.h b/thirdparty/bullet/Bullet3Common/b3Quaternion.h
index ad20543348..9bd5ff7d90 100644
--- a/thirdparty/bullet/Bullet3Common/b3Quaternion.h
+++ b/thirdparty/bullet/Bullet3Common/b3Quaternion.h
@@ -12,19 +12,12 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-
 #ifndef B3_SIMD__QUATERNION_H_
 #define B3_SIMD__QUATERNION_H_
 
-
 #include "b3Vector3.h"
 #include "b3QuadWord.h"
 
-
-
-
-
 #ifdef B3_USE_SSE
 
 const __m128 B3_ATTRIBUTE_ALIGNED16(b3vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
@@ -39,13 +32,14 @@ const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f
 #endif
 
 /**@brief The b3Quaternion implements quaternion to perform linear algebra rotations in combination with b3Matrix3x3, b3Vector3 and b3Transform. */
-class b3Quaternion : public b3QuadWord {
+class b3Quaternion : public b3QuadWord
+{
 public:
-  /**@brief No initialization constructor */
+	/**@brief No initialization constructor */
 	b3Quaternion() {}
 
-#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))|| defined(B3_USE_NEON) 
-	// Set Vector 
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// Set Vector
 	B3_FORCE_INLINE b3Quaternion(const b3SimdFloat4 vec)
 	{
 		mVec128 = vec;
@@ -58,63 +52,70 @@ public:
 	}
 
 	// Assignment Operator
-	B3_FORCE_INLINE b3Quaternion& 
-	operator=(const b3Quaternion& v) 
+	B3_FORCE_INLINE b3Quaternion&
+	operator=(const b3Quaternion& v)
 	{
 		mVec128 = v.mVec128;
-		
+
 		return *this;
 	}
-	
+
 #endif
 
 	//		template <typename b3Scalar>
 	//		explicit Quaternion(const b3Scalar *v) : Tuple4<b3Scalar>(v) {}
-  /**@brief Constructor from scalars */
-	b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w) 
-		: b3QuadWord(_x, _y, _z, _w) 
+	/**@brief Constructor from scalars */
+	b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+		: b3QuadWord(_x, _y, _z, _w)
 	{
 		//b3Assert(!((_x==1.f) && (_y==0.f) && (_z==0.f) && (_w==0.f)));
 	}
-  /**@brief Axis angle Constructor
+	/**@brief Axis angle Constructor
    * @param axis The axis which the rotation is around
    * @param angle The magnitude of the rotation around the angle (Radians) */
-	b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle) 
-	{ 
-		setRotation(_axis, _angle); 
+	b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle)
+	{
+		setRotation(_axis, _angle);
 	}
-  /**@brief Constructor from Euler angles
+	/**@brief Constructor from Euler angles
    * @param yaw Angle around Y unless B3_EULER_DEFAULT_ZYX defined then Z
    * @param pitch Angle around X unless B3_EULER_DEFAULT_ZYX defined then Y
    * @param roll Angle around Z unless B3_EULER_DEFAULT_ZYX defined then X */
 	b3Quaternion(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
-	{ 
+	{
 #ifndef B3_EULER_DEFAULT_ZYX
-		setEuler(yaw, pitch, roll); 
+		setEuler(yaw, pitch, roll);
 #else
-		setEulerZYX(yaw, pitch, roll); 
-#endif 
+		setEulerZYX(yaw, pitch, roll);
+#endif
 	}
-  /**@brief Set the rotation using axis angle notation 
+	/**@brief Set the rotation using axis angle notation; an axis shorter than B3_EPSILON sets the identity rotation (0, 0, 0, 1)
    * @param axis The axis around which to rotate
    * @param angle The magnitude of the rotation in Radians */
 	void setRotation(const b3Vector3& axis, const b3Scalar& _angle)
 	{
 		b3Scalar d = axis.length();
 		b3Assert(d != b3Scalar(0.0));
-		b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d;
-		setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s, 
-			b3Cos(_angle * b3Scalar(0.5)));
+		if (d < B3_EPSILON)
+		{
+			setValue(0, 0, 0, 1);
+		}
+		else
+		{
+			b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d;
+			setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s,
+				b3Cos(_angle * b3Scalar(0.5)));
+		}
 	}
-  /**@brief Set the quaternion using Euler angles
+	/**@brief Set the quaternion using Euler angles
    * @param yaw Angle around Y
    * @param pitch Angle around X
    * @param roll Angle around Z */
 	void setEuler(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
 	{
-		b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5);  
-		b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5);  
-		b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5);  
+		b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5);
+		b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5);
+		b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5);
 		b3Scalar cosYaw = b3Cos(halfYaw);
 		b3Scalar sinYaw = b3Sin(halfYaw);
 		b3Scalar cosPitch = b3Cos(halfPitch);
@@ -122,34 +123,34 @@ public:
 		b3Scalar cosRoll = b3Cos(halfRoll);
 		b3Scalar sinRoll = b3Sin(halfRoll);
 		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
-			cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
-			sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
-			cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
+				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
+				 sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
+				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
 	}
- 
+
 	/**@brief Set the quaternion using euler angles 
    * @param yaw Angle around Z
    * @param pitch Angle around Y
    * @param roll Angle around X */
 	void setEulerZYX(const b3Scalar& yawZ, const b3Scalar& pitchY, const b3Scalar& rollX)
 	{
-		b3Scalar halfYaw = b3Scalar(yawZ) * b3Scalar(0.5);  
-		b3Scalar halfPitch = b3Scalar(pitchY) * b3Scalar(0.5);  
-		b3Scalar halfRoll = b3Scalar(rollX) * b3Scalar(0.5);  
+		b3Scalar halfYaw = b3Scalar(yawZ) * b3Scalar(0.5);
+		b3Scalar halfPitch = b3Scalar(pitchY) * b3Scalar(0.5);
+		b3Scalar halfRoll = b3Scalar(rollX) * b3Scalar(0.5);
 		b3Scalar cosYaw = b3Cos(halfYaw);
 		b3Scalar sinYaw = b3Sin(halfYaw);
 		b3Scalar cosPitch = b3Cos(halfPitch);
 		b3Scalar sinPitch = b3Sin(halfPitch);
 		b3Scalar cosRoll = b3Cos(halfRoll);
 		b3Scalar sinRoll = b3Sin(halfRoll);
-		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x
-                         cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y
-                         cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z
-                         cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx
+		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,   //x
+				 cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,   //y
+				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,   //z
+				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);  //formerly yzx
 		normalize();
 	}
 
-	  /**@brief Get the euler angles from this quaternion
+	/**@brief Get the euler angles from this quaternion
 	   * @param yaw Angle around Z
 	   * @param pitch Angle around Y
 	   * @param roll Angle around X */
@@ -166,221 +167,221 @@ public:
 		squ = m_floats[3] * m_floats[3];
 		rollX = b3Atan2(2 * (m_floats[1] * m_floats[2] + m_floats[3] * m_floats[0]), squ - sqx - sqy + sqz);
 		sarg = b3Scalar(-2.) * (m_floats[0] * m_floats[2] - m_floats[3] * m_floats[1]);
-		pitchY = sarg <= b3Scalar(-1.0) ? b3Scalar(-0.5) * B3_PI: (sarg >= b3Scalar(1.0) ? b3Scalar(0.5) * B3_PI : b3Asin(sarg));
+		pitchY = sarg <= b3Scalar(-1.0) ? b3Scalar(-0.5) * B3_PI : (sarg >= b3Scalar(1.0) ? b3Scalar(0.5) * B3_PI : b3Asin(sarg));
 		yawZ = b3Atan2(2 * (m_floats[0] * m_floats[1] + m_floats[3] * m_floats[2]), squ + sqx - sqy - sqz);
 	}
 
-  /**@brief Add two quaternions
+	/**@brief Add two quaternions
    * @param q The quaternion to add to this one */
-	B3_FORCE_INLINE	b3Quaternion& operator+=(const b3Quaternion& q)
+	B3_FORCE_INLINE b3Quaternion& operator+=(const b3Quaternion& q)
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_add_ps(mVec128, q.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vaddq_f32(mVec128, q.mVec128);
-#else	
-		m_floats[0] += q.getX(); 
-        m_floats[1] += q.getY(); 
-        m_floats[2] += q.getZ(); 
-        m_floats[3] += q.m_floats[3];
+#else
+		m_floats[0] += q.getX();
+		m_floats[1] += q.getY();
+		m_floats[2] += q.getZ();
+		m_floats[3] += q.m_floats[3];
 #endif
 		return *this;
 	}
 
-  /**@brief Subtract out a quaternion
+	/**@brief Subtract out a quaternion
    * @param q The quaternion to subtract from this one */
-	b3Quaternion& operator-=(const b3Quaternion& q) 
+	b3Quaternion& operator-=(const b3Quaternion& q)
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vsubq_f32(mVec128, q.mVec128);
-#else	
-		m_floats[0] -= q.getX(); 
-        m_floats[1] -= q.getY(); 
-        m_floats[2] -= q.getZ(); 
-        m_floats[3] -= q.m_floats[3];
+#else
+		m_floats[0] -= q.getX();
+		m_floats[1] -= q.getY();
+		m_floats[2] -= q.getZ();
+		m_floats[3] -= q.m_floats[3];
 #endif
-        return *this;
+		return *this;
 	}
 
-  /**@brief Scale this quaternion
+	/**@brief Scale this quaternion
    * @param s The scalar to scale by */
 	b3Quaternion& operator*=(const b3Scalar& s)
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-		vs = b3_pshufd_ps(vs, 0);	//	(S S S S)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0);     //	(S S S S)
 		mVec128 = _mm_mul_ps(mVec128, vs);
 #elif defined(B3_USE_NEON)
 		mVec128 = vmulq_n_f32(mVec128, s);
 #else
-		m_floats[0] *= s; 
-        m_floats[1] *= s; 
-        m_floats[2] *= s; 
-        m_floats[3] *= s;
+		m_floats[0] *= s;
+		m_floats[1] *= s;
+		m_floats[2] *= s;
+		m_floats[3] *= s;
 #endif
 		return *this;
 	}
 
-  /**@brief Multiply this quaternion by q on the right
+	/**@brief Multiply this quaternion by q on the right
    * @param q The other quaternion 
    * Equivilant to this = this * q */
 	b3Quaternion& operator*=(const b3Quaternion& q)
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		__m128 vQ2 = q.get128();
-		
-		__m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0,1,2,0));
-		__m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0));
-		
+
+		__m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0, 1, 2, 0));
+		__m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));
+
 		A1 = A1 * B1;
-		
-		__m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1,2,0,1));
-		__m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
-		
+
+		__m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 1));
+		__m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
+
 		A2 = A2 * B2;
-		
-		B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2,0,1,2));
-		B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
-		
-		B1 = B1 * B2;	//	A3 *= B3
-		
-		mVec128 = b3_splat_ps(mVec128, 3);	//	A0
-		mVec128 = mVec128 * vQ2;	//	A0 * B0
-		
-		A1 = A1 + A2;	//	AB12
-		mVec128 = mVec128 - B1;	//	AB03 = AB0 - AB3 
-		A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
-		mVec128 = mVec128+ A1;	//	AB03 + AB12
-
-#elif defined(B3_USE_NEON)     
-
-        float32x4_t vQ1 = mVec128;
-        float32x4_t vQ2 = q.get128();
-        float32x4_t A0, A1, B1, A2, B2, A3, B3;
-        float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
-        
-        {
-        float32x2x2_t tmp;
-        tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
-        vQ1zx = tmp.val[0];
-
-        tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
-        vQ2zx = tmp.val[0];
-        }
-        vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
-
-        vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
-
-        vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
-        vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
-
-        A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
-        B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
-
-        A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
-        B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
-
-        A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
-        B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
-
-        A1 = vmulq_f32(A1, B1);
-        A2 = vmulq_f32(A2, B2);
-        A3 = vmulq_f32(A3, B3);	//	A3 *= B3
-        A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
-
-        A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
-        A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
-        
-        //	change the sign of the last element
-        A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
-        A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
-        
-        mVec128 = A0;
+
+		B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2, 0, 1, 2));
+		B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+		B1 = B1 * B2;  //	A3 *= B3
+
+		mVec128 = b3_splat_ps(mVec128, 3);  //	A0
+		mVec128 = mVec128 * vQ2;            //	A0 * B0
+
+		A1 = A1 + A2;                  //	AB12
+		mVec128 = mVec128 - B1;        //	AB03 = AB0 - AB3
+		A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+		mVec128 = mVec128 + A1;        //	AB03 + AB12
+
+#elif defined(B3_USE_NEON)
+
+		float32x4_t vQ1 = mVec128;
+		float32x4_t vQ2 = q.get128();
+		float32x4_t A0, A1, B1, A2, B2, A3, B3;
+		float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+
+		{
+			float32x2x2_t tmp;
+			tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+			vQ1zx = tmp.val[0];
+
+			tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+			vQ2zx = tmp.val[0];
+		}
+		vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
+
+		vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+		vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+		vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+		A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+		B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
+
+		A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+		B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+		A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+		B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
+
+		A1 = vmulq_f32(A1, B1);
+		A2 = vmulq_f32(A2, B2);
+		A3 = vmulq_f32(A3, B3);                           //	A3 *= B3
+		A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  //	A0 * B0
+
+		A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+		A0 = vsubq_f32(A0, A3);  //	AB03 = AB0 - AB3
+
+		//	change the sign of the last element
+		A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+		A0 = vaddq_f32(A0, A1);  //	AB03 + AB12
+
+		mVec128 = A0;
 #else
 		setValue(
-            m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(),
+			m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(),
 			m_floats[3] * q.getY() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.getX() - m_floats[0] * q.getZ(),
 			m_floats[3] * q.getZ() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.getY() - m_floats[1] * q.getX(),
 			m_floats[3] * q.m_floats[3] - m_floats[0] * q.getX() - m_floats[1] * q.getY() - m_floats[2] * q.getZ());
 #endif
 		return *this;
 	}
-  /**@brief Return the dot product between this quaternion and another
+	/**@brief Return the dot product between this quaternion and another
    * @param q The other quaternion */
 	b3Scalar dot(const b3Quaternion& q) const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vd;
-		
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vd;
+
 		vd = _mm_mul_ps(mVec128, q.mVec128);
-		
-        __m128 t = _mm_movehl_ps(vd, vd);
+
+		__m128 t = _mm_movehl_ps(vd, vd);
 		vd = _mm_add_ps(vd, t);
 		t = _mm_shuffle_ps(vd, vd, 0x55);
 		vd = _mm_add_ss(vd, t);
-		
-        return _mm_cvtss_f32(vd);
+
+		return _mm_cvtss_f32(vd);
 #elif defined(B3_USE_NEON)
 		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
-		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));  
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
 		x = vpadd_f32(x, x);
 		return vget_lane_f32(x, 0);
-#else    
-		return  m_floats[0] * q.getX() + 
-                m_floats[1] * q.getY() + 
-                m_floats[2] * q.getZ() + 
-                m_floats[3] * q.m_floats[3];
+#else
+		return m_floats[0] * q.getX() +
+			   m_floats[1] * q.getY() +
+			   m_floats[2] * q.getZ() +
+			   m_floats[3] * q.m_floats[3];
 #endif
 	}
 
-  /**@brief Return the length squared of the quaternion */
+	/**@brief Return the length squared of the quaternion */
 	b3Scalar length2() const
 	{
 		return dot(*this);
 	}
 
-  /**@brief Return the length of the quaternion */
+	/**@brief Return the length of the quaternion */
 	b3Scalar length() const
 	{
 		return b3Sqrt(length2());
 	}
 
-  /**@brief Normalize the quaternion 
+	/**@brief Normalize the quaternion 
    * Such that x^2 + y^2 + z^2 +w^2 = 1 */
-	b3Quaternion& normalize() 
+	b3Quaternion& normalize()
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vd;
-		
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vd;
+
 		vd = _mm_mul_ps(mVec128, mVec128);
-		
-        __m128 t = _mm_movehl_ps(vd, vd);
+
+		__m128 t = _mm_movehl_ps(vd, vd);
 		vd = _mm_add_ps(vd, t);
 		t = _mm_shuffle_ps(vd, vd, 0x55);
 		vd = _mm_add_ss(vd, t);
 
 		vd = _mm_sqrt_ss(vd);
 		vd = _mm_div_ss(b3vOnes, vd);
-        vd = b3_pshufd_ps(vd, 0); // splat
+		vd = b3_pshufd_ps(vd, 0);  // splat
 		mVec128 = _mm_mul_ps(mVec128, vd);
-    
+
 		return *this;
-#else    
+#else
 		return *this /= length();
 #endif
 	}
 
-  /**@brief Return a scaled version of this quaternion
+	/**@brief Return a scaled version of this quaternion
    * @param s The scale factor */
 	B3_FORCE_INLINE b3Quaternion
 	operator*(const b3Scalar& s) const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-		vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
-		
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x00);  //	(S S S S)
+
 		return b3Quaternion(_mm_mul_ps(mVec128, vs));
 #elif defined(B3_USE_NEON)
 		return b3Quaternion(vmulq_n_f32(mVec128, s));
@@ -389,7 +390,7 @@ public:
 #endif
 	}
 
-  /**@brief Return an inversely scaled versionof this quaternion
+	/**@brief Return an inversely scaled version of this quaternion
    * @param s The inverse scale factor */
 	b3Quaternion operator/(const b3Scalar& s) const
 	{
@@ -397,29 +398,29 @@ public:
 		return *this * (b3Scalar(1.0) / s);
 	}
 
-  /**@brief Inversely scale this quaternion
+	/**@brief Inversely scale this quaternion
    * @param s The scale factor */
-	b3Quaternion& operator/=(const b3Scalar& s) 
+	b3Quaternion& operator/=(const b3Scalar& s)
 	{
 		b3Assert(s != b3Scalar(0.0));
 		return *this *= b3Scalar(1.0) / s;
 	}
 
-  /**@brief Return a normalized version of this quaternion */
-	b3Quaternion normalized() const 
+	/**@brief Return a normalized version of this quaternion */
+	b3Quaternion normalized() const
 	{
 		return *this / length();
-	} 
-  /**@brief Return the angle between this quaternion and the other 
+	}
+	/**@brief Return the angle between this quaternion and the other 
    * @param q The other quaternion */
-	b3Scalar angle(const b3Quaternion& q) const 
+	b3Scalar angle(const b3Quaternion& q) const
 	{
 		b3Scalar s = b3Sqrt(length2() * q.length2());
 		b3Assert(s != b3Scalar(0.0));
 		return b3Acos(dot(q) / s);
 	}
-  /**@brief Return the angle of rotation represented by this quaternion */
-	b3Scalar getAngle() const 
+	/**@brief Return the angle of rotation represented by this quaternion */
+	b3Scalar getAngle() const
 	{
 		b3Scalar s = b3Scalar(2.) * b3Acos(m_floats[3]);
 		return s;
@@ -428,117 +429,116 @@ public:
 	/**@brief Return the axis of the rotation represented by this quaternion */
 	b3Vector3 getAxis() const
 	{
-		b3Scalar s_squared = 1.f-m_floats[3]*m_floats[3];
-		
-		if (s_squared < b3Scalar(10.) * B3_EPSILON) //Check for divide by zero
-			return b3MakeVector3(1.0, 0.0, 0.0);  // Arbitrary
-		b3Scalar s = 1.f/b3Sqrt(s_squared);
+		b3Scalar s_squared = 1.f - m_floats[3] * m_floats[3];
+
+		if (s_squared < b3Scalar(10.) * B3_EPSILON)  //Check for divide by zero
+			return b3MakeVector3(1.0, 0.0, 0.0);     // Arbitrary
+		b3Scalar s = 1.f / b3Sqrt(s_squared);
 		return b3MakeVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
 	}
 
 	/**@brief Return the inverse of this quaternion */
 	b3Quaternion inverse() const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3Quaternion(_mm_xor_ps(mVec128, b3vQInv));
 #elif defined(B3_USE_NEON)
-        return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv));
-#else	
+		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv));
+#else
 		return b3Quaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
 #endif
 	}
 
-  /**@brief Return the sum of this quaternion and the other 
+	/**@brief Return the sum of this quaternion and the other 
    * @param q2 The other quaternion */
 	B3_FORCE_INLINE b3Quaternion
 	operator+(const b3Quaternion& q2) const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3Quaternion(_mm_add_ps(mVec128, q2.mVec128));
 #elif defined(B3_USE_NEON)
-        return b3Quaternion(vaddq_f32(mVec128, q2.mVec128));
-#else	
+		return b3Quaternion(vaddq_f32(mVec128, q2.mVec128));
+#else
 		const b3Quaternion& q1 = *this;
 		return b3Quaternion(q1.getX() + q2.getX(), q1.getY() + q2.getY(), q1.getZ() + q2.getZ(), q1.m_floats[3] + q2.m_floats[3]);
 #endif
 	}
 
-  /**@brief Return the difference between this quaternion and the other 
+	/**@brief Return the difference between this quaternion and the other 
    * @param q2 The other quaternion */
 	B3_FORCE_INLINE b3Quaternion
 	operator-(const b3Quaternion& q2) const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3Quaternion(_mm_sub_ps(mVec128, q2.mVec128));
 #elif defined(B3_USE_NEON)
-        return b3Quaternion(vsubq_f32(mVec128, q2.mVec128));
-#else	
+		return b3Quaternion(vsubq_f32(mVec128, q2.mVec128));
+#else
 		const b3Quaternion& q1 = *this;
 		return b3Quaternion(q1.getX() - q2.getX(), q1.getY() - q2.getY(), q1.getZ() - q2.getZ(), q1.m_floats[3] - q2.m_floats[3]);
 #endif
 	}
 
-  /**@brief Return the negative of this quaternion 
+	/**@brief Return the negative of this quaternion 
    * This simply negates each element */
 	B3_FORCE_INLINE b3Quaternion operator-() const
 	{
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3Quaternion(_mm_xor_ps(mVec128, b3vMzeroMask));
 #elif defined(B3_USE_NEON)
-		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask) );
-#else	
+		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask));
+#else
 		const b3Quaternion& q2 = *this;
-		return b3Quaternion( - q2.getX(), - q2.getY(),  - q2.getZ(),  - q2.m_floats[3]);
+		return b3Quaternion(-q2.getX(), -q2.getY(), -q2.getZ(), -q2.m_floats[3]);
 #endif
 	}
-  /**@todo document this and it's use */
-	B3_FORCE_INLINE b3Quaternion farthest( const b3Quaternion& qd) const 
+	/**@todo document this and its use */
+	B3_FORCE_INLINE b3Quaternion farthest(const b3Quaternion& qd) const
 	{
-		b3Quaternion diff,sum;
+		b3Quaternion diff, sum;
 		diff = *this - qd;
 		sum = *this + qd;
-		if( diff.dot(diff) > sum.dot(sum) )
+		if (diff.dot(diff) > sum.dot(sum))
 			return qd;
 		return (-qd);
 	}
 
 	/**@todo document this and it's use */
-	B3_FORCE_INLINE b3Quaternion nearest( const b3Quaternion& qd) const 
+	B3_FORCE_INLINE b3Quaternion nearest(const b3Quaternion& qd) const
 	{
-		b3Quaternion diff,sum;
+		b3Quaternion diff, sum;
 		diff = *this - qd;
 		sum = *this + qd;
-		if( diff.dot(diff) < sum.dot(sum) )
+		if (diff.dot(diff) < sum.dot(sum))
 			return qd;
 		return (-qd);
 	}
 
-
-  /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion
+	/**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion
    * @param q The other quaternion to interpolate with 
    * @param t The ratio between this and q to interpolate.  If t = 0 the result is this, if t=1 the result is q.
    * Slerp interpolates assuming constant velocity.  */
 	b3Quaternion slerp(const b3Quaternion& q, const b3Scalar& t) const
 	{
-	  b3Scalar magnitude = b3Sqrt(length2() * q.length2()); 
-	  b3Assert(magnitude > b3Scalar(0));
+		b3Scalar magnitude = b3Sqrt(length2() * q.length2());
+		b3Assert(magnitude > b3Scalar(0));
 
-    b3Scalar product = dot(q) / magnitude;
-    if (b3Fabs(product) < b3Scalar(1))
+		b3Scalar product = dot(q) / magnitude;
+		if (b3Fabs(product) < b3Scalar(1))
 		{
-      // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
-      const b3Scalar sign = (product < 0) ? b3Scalar(-1) : b3Scalar(1);
-
-      const b3Scalar theta = b3Acos(sign * product);
-      const b3Scalar s1 = b3Sin(sign * t * theta);   
-      const b3Scalar d = b3Scalar(1.0) / b3Sin(theta);
-      const b3Scalar s0 = b3Sin((b3Scalar(1.0) - t) * theta);
-
-      return b3Quaternion(
-          (m_floats[0] * s0 + q.getX() * s1) * d,
-          (m_floats[1] * s0 + q.getY() * s1) * d,
-          (m_floats[2] * s0 + q.getZ() * s1) * d,
-          (m_floats[3] * s0 + q.m_floats[3] * s1) * d);
+			// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
+			const b3Scalar sign = (product < 0) ? b3Scalar(-1) : b3Scalar(1);
+
+			const b3Scalar theta = b3Acos(sign * product);
+			const b3Scalar s1 = b3Sin(sign * t * theta);
+			const b3Scalar d = b3Scalar(1.0) / b3Sin(theta);
+			const b3Scalar s0 = b3Sin((b3Scalar(1.0) - t) * theta);
+
+			return b3Quaternion(
+				(m_floats[0] * s0 + q.getX() * s1) * d,
+				(m_floats[1] * s0 + q.getY() * s1) * d,
+				(m_floats[2] * s0 + q.getZ() * s1) * d,
+				(m_floats[3] * s0 + q.m_floats[3] * s1) * d);
 		}
 		else
 		{
@@ -546,301 +546,294 @@ public:
 		}
 	}
 
-	static const b3Quaternion&	getIdentity()
+	static const b3Quaternion& getIdentity()
 	{
-		static const b3Quaternion identityQuat(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.),b3Scalar(1.));
+		static const b3Quaternion identityQuat(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.), b3Scalar(1.));
 		return identityQuat;
 	}
 
 	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
-
-	
 };
 
-
-
-
-
 /**@brief Return the product of two quaternions */
 B3_FORCE_INLINE b3Quaternion
-operator*(const b3Quaternion& q1, const b3Quaternion& q2) 
+operator*(const b3Quaternion& q1, const b3Quaternion& q2)
 {
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	__m128 vQ1 = q1.get128();
 	__m128 vQ2 = q2.get128();
 	__m128 A0, A1, B1, A2, B2;
-    
-	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0)); // X Y  z x     //      vtrn
-	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0)); // W W  W X     // vdup vext
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0));  // X Y  z x     //      vtrn
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));  // W W  W X     // vdup vext
 
 	A1 = A1 * B1;
-	
-	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1)); // Y Z  X Y     // vext 
-	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1)); // z x  Y Y     // vtrn vdup
+
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));  // Y Z  X Y     // vext
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));  // z x  Y Y     // vtrn vdup
 
 	A2 = A2 * B2;
 
-	B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2)); // z x Y Z      // vtrn vext
-	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2)); // Y Z x z      // vext vtrn
-	
-	B1 = B1 * B2;	//	A3 *= B3
+	B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));  // z x Y Z      // vtrn vext
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));  // Y Z x z      // vext vtrn
+
+	B1 = B1 * B2;  //	A3 *= B3
 
-	A0 = b3_splat_ps(vQ1, 3);	//	A0
-	A0 = A0 * vQ2;	//	A0 * B0
+	A0 = b3_splat_ps(vQ1, 3);  //	A0
+	A0 = A0 * vQ2;             //	A0 * B0
+
+	A1 = A1 + A2;  //	AB12
+	A0 = A0 - B1;  //	AB03 = AB0 - AB3
+
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A0 = A0 + A1;                  //	AB03 + AB12
 
-	A1 = A1 + A2;	//	AB12
-	A0 =  A0 - B1;	//	AB03 = AB0 - AB3 
-	
-    A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
-	A0 = A0 + A1;	//	AB03 + AB12
-	
 	return b3Quaternion(A0);
 
-#elif defined(B3_USE_NEON)     
+#elif defined(B3_USE_NEON)
 
 	float32x4_t vQ1 = q1.get128();
 	float32x4_t vQ2 = q2.get128();
 	float32x4_t A0, A1, B1, A2, B2, A3, B3;
-    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
-    
-    {
-    float32x2x2_t tmp;
-    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
-    vQ1zx = tmp.val[0];
+	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+
+	{
+		float32x2x2_t tmp;
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
 
-    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
-    vQ2zx = tmp.val[0];
-    }
-    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
+	}
+	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
 
-    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
 
-    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
-    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
 
-    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
-    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
 
 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
-    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
 
-    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
-    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
 
 	A1 = vmulq_f32(A1, B1);
 	A2 = vmulq_f32(A2, B2);
-	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
-	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
-
-	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
-	A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
-	
-    //	change the sign of the last element
-    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
-	A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
-	
+	A3 = vmulq_f32(A3, B3);                           //	A3 *= B3
+	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  //	A0 * B0
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+	A0 = vsubq_f32(A0, A3);  //	AB03 = AB0 - AB3
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+	A0 = vaddq_f32(A0, A1);  //	AB03 + AB12
+
 	return b3Quaternion(A0);
 
 #else
 	return b3Quaternion(
-        q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(),
+		q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(),
 		q1.getW() * q2.getY() + q1.getY() * q2.getW() + q1.getZ() * q2.getX() - q1.getX() * q2.getZ(),
 		q1.getW() * q2.getZ() + q1.getZ() * q2.getW() + q1.getX() * q2.getY() - q1.getY() * q2.getX(),
-		q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ()); 
+		q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ());
 #endif
 }
 
 B3_FORCE_INLINE b3Quaternion
 operator*(const b3Quaternion& q, const b3Vector3& w)
 {
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	__m128 vQ1 = q.get128();
 	__m128 vQ2 = w.get128();
 	__m128 A1, B1, A2, B2, A3, B3;
-	
-	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3,3,3,0));
-	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0,1,2,0));
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3, 3, 3, 0));
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0, 1, 2, 0));
 
 	A1 = A1 * B1;
-	
-	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1));
-	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
+
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
 
 	A2 = A2 * B2;
 
-	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2));
-	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
-	
-	A3 = A3 * B3;	//	A3 *= B3
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+	A3 = A3 * B3;  //	A3 *= B3
+
+	A1 = A1 + A2;                  //	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A1 = A1 - A3;                  //	AB123 = AB12 - AB3
 
-	A1 = A1 + A2;	//	AB12
-	A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
-    A1 = A1 - A3;	//	AB123 = AB12 - AB3 
-	
 	return b3Quaternion(A1);
-    
-#elif defined(B3_USE_NEON)     
+
+#elif defined(B3_USE_NEON)
 
 	float32x4_t vQ1 = q.get128();
 	float32x4_t vQ2 = w.get128();
 	float32x4_t A1, B1, A2, B2, A3, B3;
-    float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
-    
-    vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); 
-    {
-    float32x2x2_t tmp;
+	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
 
-    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
-    vQ2zx = tmp.val[0];
+	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
+	{
+		float32x2x2_t tmp;
 
-    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
-    vQ1zx = tmp.val[0];
-    }
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
 
-    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
+	}
+
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
 
-    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
-    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
 
-    A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W  W X 
-    B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                    // X Y  z x 
+	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);  // W W  W X
+	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                     // X Y  z x
 
 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
-    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
 
-    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
-    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
 
 	A1 = vmulq_f32(A1, B1);
 	A2 = vmulq_f32(A2, B2);
-	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
-
-	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
-	
-    //	change the sign of the last element
-    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
-	
-    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
-	
+	A3 = vmulq_f32(A3, B3);  //	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+
+	A1 = vsubq_f32(A1, A3);  //	AB123 = AB12 - AB3
+
 	return b3Quaternion(A1);
-    
+
 #else
-	return b3Quaternion( 
-         q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(),
-		 q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(),
-		 q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(),
-		-q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ()); 
+	return b3Quaternion(
+		q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(),
+		q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(),
+		q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(),
+		-q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ());
 #endif
 }
 
 B3_FORCE_INLINE b3Quaternion
 operator*(const b3Vector3& w, const b3Quaternion& q)
 {
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	__m128 vQ1 = w.get128();
 	__m128 vQ2 = q.get128();
 	__m128 A1, B1, A2, B2, A3, B3;
-	
-	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0,1,2,0));  // X Y  z x
-	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3,3,3,0));  // W W  W X 
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0));  // X Y  z x
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));  // W W  W X
 
 	A1 = A1 * B1;
-	
-	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1,2,0,1));
-	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2,0,1,1));
 
-	A2 = A2 *B2;
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
+
+	A2 = A2 * B2;
+
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+	A3 = A3 * B3;  //	A3 *= B3
 
-	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2,0,1,2));
-	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1,2,0,2));
-	
-	A3 = A3 * B3;	//	A3 *= B3
+	A1 = A1 + A2;                  //	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A1 = A1 - A3;                  //	AB123 = AB12 - AB3
 
-	A1 = A1 + A2;	//	AB12
-	A1 = _mm_xor_ps(A1, b3vPPPM);	//	change sign of the last element
-	A1 = A1 - A3;	//	AB123 = AB12 - AB3 
-	
 	return b3Quaternion(A1);
 
-#elif defined(B3_USE_NEON)     
+#elif defined(B3_USE_NEON)
 
 	float32x4_t vQ1 = w.get128();
 	float32x4_t vQ2 = q.get128();
-	float32x4_t  A1, B1, A2, B2, A3, B3;
-    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
-    
-    {
-    float32x2x2_t tmp;
-   
-    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
-    vQ1zx = tmp.val[0];
+	float32x4_t A1, B1, A2, B2, A3, B3;
+	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
 
-    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
-    vQ2zx = tmp.val[0];
-    }
-    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+	{
+		float32x2x2_t tmp;
+
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
+
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
+	}
+	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
 
-    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
 
-    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
-    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
 
-    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
-    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
 
 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
-    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
 
-    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
-    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
 
 	A1 = vmulq_f32(A1, B1);
 	A2 = vmulq_f32(A2, B2);
-	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
-
-	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
-	
-    //	change the sign of the last element
-    A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);	
-	
-    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
-	
+	A3 = vmulq_f32(A3, B3);  //	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+
+	A1 = vsubq_f32(A1, A3);  //	AB123 = AB12 - AB3
+
 	return b3Quaternion(A1);
-    
+
 #else
-	return b3Quaternion( 
-        +w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(),
+	return b3Quaternion(
+		+w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(),
 		+w.getY() * q.getW() + w.getZ() * q.getX() - w.getX() * q.getZ(),
 		+w.getZ() * q.getW() + w.getX() * q.getY() - w.getY() * q.getX(),
-		-w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ()); 
+		-w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ());
 #endif
 }
 
 /**@brief Calculate the dot product between two quaternions */
-B3_FORCE_INLINE b3Scalar 
-b3Dot(const b3Quaternion& q1, const b3Quaternion& q2) 
-{ 
-	return q1.dot(q2); 
+B3_FORCE_INLINE b3Scalar
+b3Dot(const b3Quaternion& q1, const b3Quaternion& q2)
+{
+	return q1.dot(q2);
 }
 
-
 /**@brief Return the length of a quaternion */
 B3_FORCE_INLINE b3Scalar
-b3Length(const b3Quaternion& q) 
-{ 
-	return q.length(); 
+b3Length(const b3Quaternion& q)
+{
+	return q.length();
 }
 
 /**@brief Return the angle between two quaternions*/
 B3_FORCE_INLINE b3Scalar
-b3Angle(const b3Quaternion& q1, const b3Quaternion& q2) 
-{ 
-	return q1.angle(q2); 
+b3Angle(const b3Quaternion& q1, const b3Quaternion& q2)
+{
+	return q1.angle(q2);
 }
 
 /**@brief Return the inverse of a quaternion*/
 B3_FORCE_INLINE b3Quaternion
-b3Inverse(const b3Quaternion& q) 
+b3Inverse(const b3Quaternion& q)
 {
 	return q.inverse();
 }
@@ -851,7 +844,7 @@ b3Inverse(const b3Quaternion& q)
  * @param t The ration between q1 and q2.  t = 0 return q1, t=1 returns q2 
  * Slerp assumes constant velocity between positions. */
 B3_FORCE_INLINE b3Quaternion
-b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t) 
+b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t)
 {
 	return q1.slerp(q2, t);
 }
@@ -859,7 +852,7 @@ b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t)
 B3_FORCE_INLINE b3Quaternion
 b3QuatMul(const b3Quaternion& rot0, const b3Quaternion& rot1)
 {
-	return rot0*rot1;
+	return rot0 * rot1;
 }
 
 B3_FORCE_INLINE b3Quaternion
@@ -868,51 +861,45 @@ b3QuatNormalized(const b3Quaternion& orn)
 	return orn.normalized();
 }
 
-
-
-B3_FORCE_INLINE b3Vector3 
-b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v) 
+B3_FORCE_INLINE b3Vector3
+b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v)
 {
 	b3Quaternion q = rotation * v;
 	q *= rotation.inverse();
-#if defined (B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	return b3MakeVector3(_mm_and_ps(q.get128(), b3vFFF0fMask));
 #elif defined(B3_USE_NEON)
-    return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask));
-#else	
-	return b3MakeVector3(q.getX(),q.getY(),q.getZ());
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask));
+#else
+	return b3MakeVector3(q.getX(), q.getY(), q.getZ());
 #endif
 }
 
-B3_FORCE_INLINE b3Quaternion 
-b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized
+B3_FORCE_INLINE b3Quaternion
+b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1)  // Game Programming Gems 2.10. make sure v0,v1 are normalized
 {
 	b3Vector3 c = v0.cross(v1);
-	b3Scalar  d = v0.dot(v1);
+	b3Scalar d = v0.dot(v1);
 
 	if (d < -1.0 + B3_EPSILON)
 	{
-		b3Vector3 n,unused;
-		b3PlaneSpace1(v0,n,unused);
-		return b3Quaternion(n.getX(),n.getY(),n.getZ(),0.0f); // just pick any vector that is orthogonal to v0
+		b3Vector3 n, unused;
+		b3PlaneSpace1(v0, n, unused);
+		return b3Quaternion(n.getX(), n.getY(), n.getZ(), 0.0f);  // just pick any vector that is orthogonal to v0
 	}
 
-	b3Scalar  s = b3Sqrt((1.0f + d) * 2.0f);
+	b3Scalar s = b3Sqrt((1.0f + d) * 2.0f);
 	b3Scalar rs = 1.0f / s;
 
-	return b3Quaternion(c.getX()*rs,c.getY()*rs,c.getZ()*rs,s * 0.5f);
-	
+	return b3Quaternion(c.getX() * rs, c.getY() * rs, c.getZ() * rs, s * 0.5f);
 }
 
-B3_FORCE_INLINE b3Quaternion 
-b3ShortestArcQuatNormalize2(b3Vector3& v0,b3Vector3& v1)
+B3_FORCE_INLINE b3Quaternion
+b3ShortestArcQuatNormalize2(b3Vector3& v0, b3Vector3& v1)
 {
 	v0.normalize();
 	v1.normalize();
-	return b3ShortestArcQuat(v0,v1);
+	return b3ShortestArcQuat(v0, v1);
 }
 
-#endif //B3_SIMD__QUATERNION_H_
-
-
-
+#endif  //B3_SIMD__QUATERNION_H_
diff --git a/thirdparty/bullet/Bullet3Common/b3Random.h b/thirdparty/bullet/Bullet3Common/b3Random.h
index dc040f1562..c2e21496c7 100644
--- a/thirdparty/bullet/Bullet3Common/b3Random.h
+++ b/thirdparty/bullet/Bullet3Common/b3Random.h
@@ -12,8 +12,6 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-
 #ifndef B3_GEN_RANDOM_H
 #define B3_GEN_RANDOM_H
 
@@ -26,8 +24,8 @@ subject to the following restrictions:
 
 #define B3_RAND_MAX UINT_MAX
 
-B3_FORCE_INLINE void         b3Srand(unsigned int seed) { init_genrand(seed); }
-B3_FORCE_INLINE unsigned int b3rand()                   { return genrand_int32(); }
+B3_FORCE_INLINE void b3Srand(unsigned int seed) { init_genrand(seed); }
+B3_FORCE_INLINE unsigned int b3rand() { return genrand_int32(); }
 
 #else
 
@@ -35,8 +33,8 @@ B3_FORCE_INLINE unsigned int b3rand()                   { return genrand_int32()
 
 #define B3_RAND_MAX RAND_MAX
 
-B3_FORCE_INLINE void         b3Srand(unsigned int seed) { srand(seed); } 
-B3_FORCE_INLINE unsigned int b3rand()                   { return rand(); }
+B3_FORCE_INLINE void b3Srand(unsigned int seed) { srand(seed); }
+B3_FORCE_INLINE unsigned int b3rand() { return rand(); }
 
 #endif
 
@@ -45,6 +43,4 @@ inline b3Scalar b3RandRange(b3Scalar minRange, b3Scalar maxRange)
 	return (b3rand() / (b3Scalar(B3_RAND_MAX) + b3Scalar(1.0))) * (maxRange - minRange) + minRange;
 }
 
-
-#endif //B3_GEN_RANDOM_H
-
+#endif  //B3_GEN_RANDOM_H
diff --git a/thirdparty/bullet/Bullet3Common/b3ResizablePool.h b/thirdparty/bullet/Bullet3Common/b3ResizablePool.h
index 06ad8a778d..cafe3ff396 100644
--- a/thirdparty/bullet/Bullet3Common/b3ResizablePool.h
+++ b/thirdparty/bullet/Bullet3Common/b3ResizablePool.h
@@ -4,10 +4,10 @@
 
 #include "Bullet3Common/b3AlignedObjectArray.h"
 
-enum 
+enum
 {
-	B3_POOL_HANDLE_TERMINAL_FREE=-1,
-	B3_POOL_HANDLE_TERMINAL_USED =-2
+	B3_POOL_HANDLE_TERMINAL_FREE = -1,
+	B3_POOL_HANDLE_TERMINAL_USED = -2
 };
 
 template <typename U>
@@ -20,25 +20,23 @@ struct b3PoolBodyHandle : public U
 	{
 		m_nextFreeHandle = next;
 	}
-	int	getNextFree() const
+	int getNextFree() const
 	{
 		return m_nextFreeHandle;
 	}
 };
 
-template <typename T> 
+template <typename T>
 class b3ResizablePool
 {
-
 protected:
-	b3AlignedObjectArray<T>	m_bodyHandles;
-	int m_numUsedHandles;						// number of active handles
-	int	m_firstFreeHandle;		// free handles list
+	b3AlignedObjectArray<T> m_bodyHandles;
+	int m_numUsedHandles;   // number of active handles
+	int m_firstFreeHandle;  // free handles list
 
 	T* getHandleInternal(int handle)
 	{
 		return &m_bodyHandles[handle];
-
 	}
 	const T* getHandleInternal(int handle) const
 	{
@@ -46,17 +44,16 @@ protected:
 	}
 
 public:
-	
 	b3ResizablePool()
 	{
 		initHandles();
 	}
-	
+
 	virtual ~b3ResizablePool()
 	{
 		exitHandles();
 	}
-///handle management
+	///handle management
 
 	int getNumHandles() const
 	{
@@ -65,44 +62,40 @@ public:
 
 	void getUsedHandles(b3AlignedObjectArray<int>& usedHandles) const
 	{
-
-		for (int i=0;i<m_bodyHandles.size();i++)
+		for (int i = 0; i < m_bodyHandles.size(); i++)
 		{
-			if (m_bodyHandles[i].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED)
+			if (m_bodyHandles[i].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED)
 			{
 				usedHandles.push_back(i);
 			}
 		}
 	}
 
-	
-
 	T* getHandle(int handle)
 	{
-		b3Assert(handle>=0);
-		b3Assert(handle<m_bodyHandles.size());
-		if ((handle<0) || (handle>=m_bodyHandles.size()))
+		b3Assert(handle >= 0);
+		b3Assert(handle < m_bodyHandles.size());
+		if ((handle < 0) || (handle >= m_bodyHandles.size()))
 		{
 			return 0;
 		}
 
-		if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED)
+		if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED)
 		{
 			return &m_bodyHandles[handle];
 		}
 		return 0;
-
 	}
 	const T* getHandle(int handle) const
 	{
-		b3Assert(handle>=0);
-		b3Assert(handle<m_bodyHandles.size());
-		if ((handle<0) || (handle>=m_bodyHandles.size()))
+		b3Assert(handle >= 0);
+		b3Assert(handle < m_bodyHandles.size());
+		if ((handle < 0) || (handle >= m_bodyHandles.size()))
 		{
 			return 0;
 		}
 
-		if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED)
+		if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED)
 		{
 			return &m_bodyHandles[handle];
 		}
@@ -120,7 +113,6 @@ public:
 			for (int i = curCapacity; i < newCapacity; i++)
 				m_bodyHandles[i].setNextFree(i + 1);
 
-
 			m_bodyHandles[newCapacity - 1].setNextFree(-1);
 		}
 		m_firstFreeHandle = curCapacity;
@@ -142,19 +134,18 @@ public:
 
 	int allocHandle()
 	{
-		b3Assert(m_firstFreeHandle>=0);
+		b3Assert(m_firstFreeHandle >= 0);
 
 		int handle = m_firstFreeHandle;
 		m_firstFreeHandle = getHandleInternal(handle)->getNextFree();
 		m_numUsedHandles++;
 
-		if (m_firstFreeHandle<0)
+		if (m_firstFreeHandle < 0)
 		{
 			//int curCapacity = m_bodyHandles.size();
-			int additionalCapacity= m_bodyHandles.size();
+			int additionalCapacity = m_bodyHandles.size();
 			increaseHandleCapacity(additionalCapacity);
 
-
 			getHandleInternal(handle)->setNextFree(m_firstFreeHandle);
 		}
 		getHandleInternal(handle)->setNextFree(B3_POOL_HANDLE_TERMINAL_USED);
@@ -162,12 +153,11 @@ public:
 		return handle;
 	}
 
-
 	void freeHandle(int handle)
 	{
 		b3Assert(handle >= 0);
 
-		if (m_bodyHandles[handle].getNextFree()==B3_POOL_HANDLE_TERMINAL_USED)
+		if (m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED)
 		{
 			getHandleInternal(handle)->clear();
 			getHandleInternal(handle)->setNextFree(m_firstFreeHandle);
@@ -176,7 +166,6 @@ public:
 		}
 	}
 };
-	///end handle management
-	
-	#endif //B3_RESIZABLE_POOL_H
-	
-\ No newline at end of file
+///end handle management
+
+#endif  //B3_RESIZABLE_POOL_H
diff --git a/thirdparty/bullet/Bullet3Common/b3Scalar.h b/thirdparty/bullet/Bullet3Common/b3Scalar.h
index dbc7fea397..0db5eb6f4f 100644
--- a/thirdparty/bullet/Bullet3Common/b3Scalar.h
+++ b/thirdparty/bullet/Bullet3Common/b3Scalar.h
@@ -12,8 +12,6 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-
 #ifndef B3_SCALAR_H
 #define B3_SCALAR_H
 
@@ -22,238 +20,252 @@ subject to the following restrictions:
 #pragma unmanaged
 #endif
 
-
-
 #include <math.h>
-#include <stdlib.h>//size_t for MSVC 6.0
+#include <stdlib.h>  //size_t for MSVC 6.0
 #include <float.h>
 
 //Original repository is at http://github.com/erwincoumans/bullet3
 #define B3_BULLET_VERSION 300
 
-inline int	b3GetVersion()
+inline int b3GetVersion()
 {
 	return B3_BULLET_VERSION;
 }
 
-#if defined(DEBUG) || defined (_DEBUG)
+#if defined(DEBUG) || defined(_DEBUG)
 #define B3_DEBUG
 #endif
 
-#include "b3Logging.h"//for b3Error
-
+#include "b3Logging.h"  //for b3Error
 
 #ifdef _WIN32
 
-		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
+#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined(_MSC_VER) && _MSC_VER < 1300)
 
-			#define B3_FORCE_INLINE inline
-			#define B3_ATTRIBUTE_ALIGNED16(a) a
-			#define B3_ATTRIBUTE_ALIGNED64(a) a
-			#define B3_ATTRIBUTE_ALIGNED128(a) a
-		#else
-			//#define B3_HAS_ALIGNED_ALLOCATOR
-			#pragma warning(disable : 4324) // disable padding warning
+#define B3_FORCE_INLINE inline
+#define B3_ATTRIBUTE_ALIGNED16(a) a
+#define B3_ATTRIBUTE_ALIGNED64(a) a
+#define B3_ATTRIBUTE_ALIGNED128(a) a
+#else
+//#define B3_HAS_ALIGNED_ALLOCATOR
+#pragma warning(disable : 4324)  // disable padding warning
 //			#pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
-			#pragma warning(disable:4996) //Turn off warnings about deprecated C routines
+#pragma warning(disable : 4996)  //Turn off warnings about deprecated C routines
 //			#pragma warning(disable:4786) // Disable the "debug name too long" warning
 
-			#define B3_FORCE_INLINE __forceinline
-			#define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
-			#define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
-			#define B3_ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
-		#ifdef _XBOX
-			#define B3_USE_VMX128
-
-			#include <ppcintrinsics.h>
- 			#define B3_HAVE_NATIVE_FSEL
- 			#define b3Fsel(a,b,c) __fsel((a),(b),(c))
-		#else
-
-#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (B3_USE_DOUBLE_PRECISION))
-	#if (defined (_M_IX86) || defined (_M_X64))
-			#define B3_USE_SSE
-			#ifdef B3_USE_SSE
-			//B3_USE_SSE_IN_API is disabled under Windows by default, because 
-			//it makes it harder to integrate Bullet into your application under Windows 
-			//(structured embedding Bullet structs/classes need to be 16-byte aligned)
-			//with relatively little performance gain
-			//If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries
-			//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
-			//#define B3_USE_SSE_IN_API
-			#endif //B3_USE_SSE
-			#include <emmintrin.h>
-	#endif
+#define B3_FORCE_INLINE __forceinline
+#define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
+#define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
+#define B3_ATTRIBUTE_ALIGNED128(a) __declspec(align(128)) a
+#ifdef _XBOX
+#define B3_USE_VMX128
+
+#include <ppcintrinsics.h>
+#define B3_HAVE_NATIVE_FSEL
+#define b3Fsel(a, b, c) __fsel((a), (b), (c))
+#else
+
+#if (defined(_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined(B3_USE_DOUBLE_PRECISION))
+#if (defined(_M_IX86) || defined(_M_X64))
+#define B3_USE_SSE
+#ifdef B3_USE_SSE
+//B3_USE_SSE_IN_API is disabled under Windows by default, because
+//it makes it harder to integrate Bullet into your application under Windows
+//(structures embedding Bullet structs/classes need to be 16-byte aligned)
+//with relatively little performance gain
+//If you are not embedding Bullet data in your classes, or you make sure that those classes are aligned on 16-byte boundaries,
+//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+//#define B3_USE_SSE_IN_API
+#endif  //B3_USE_SSE
+#include <emmintrin.h>
+#endif
 #endif
 
-		#endif//_XBOX
+#endif  //_XBOX
 
-		#endif //__MINGW32__
+#endif  //__MINGW32__
 
 #ifdef B3_DEBUG
-	#ifdef _MSC_VER
-		#include <stdio.h>
-		#define b3Assert(x) { if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak();	}}
-	#else//_MSC_VER
-		#include <assert.h>
-		#define b3Assert assert
-	#endif//_MSC_VER
+#ifdef _MSC_VER
+#include <stdio.h>
+#define b3Assert(x) /* debug assert: report file:line and the failed expression, then break */ \
+	{                                                                                          \
+		if (!(x))                                                                              \
+		{                                                                                      \
+			b3Error(                                                                           \
+				"Assert " __FILE__ /* space needed: C++11 lexes "lit"ident as a UDL suffix */  \
+				":%u (%s)\n", /* expression passed as %s so a '%' in it is not a format */     \
+				(unsigned)__LINE__, #x);                                                       \
+			__debugbreak();                                                                    \
+		}                                                                                      \
+	}
+#else  //_MSC_VER
+#include <assert.h>
+#define b3Assert assert
+#endif  //_MSC_VER
 #else
-		#define b3Assert(x)
+#define b3Assert(x)
 #endif
-		//b3FullAssert is optional, slows down a lot
-		#define b3FullAssert(x)
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
 
-		#define b3Likely(_c)  _c
-		#define b3Unlikely(_c) _c
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
 
 #else
-	
-#if defined	(__CELLOS_LV2__)
-		#define B3_FORCE_INLINE inline __attribute__((always_inline))
-		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
-		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
-		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
-		#ifndef assert
-		#include <assert.h>
-		#endif
+
+#if defined(__CELLOS_LV2__)
+#define B3_FORCE_INLINE inline __attribute__((always_inline))
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
 #ifdef B3_DEBUG
 #ifdef __SPU__
 #include <spu_printf.h>
 #define printf spu_printf
-	#define b3Assert(x) {if(!(x)){b3Error("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
+#define b3Assert(x) /* SPU debug assert: report file:line and the failed expression, then halt */ \
+	{                                                                                             \
+		if (!(x))                                                                                 \
+		{                                                                                         \
+			b3Error(                                                                              \
+				"Assert " __FILE__ /* space needed: C++11 lexes "lit"ident as a UDL suffix */     \
+				":%u (%s)\n", /* expression passed as %s so a '%' in it is not a format */        \
+				(unsigned)__LINE__, #x);                                                          \
+			spu_hcmpeq(0, 0);                                                                     \
+		}                                                                                         \
+	}
 #else
-	#define b3Assert assert
+#define b3Assert assert
 #endif
-	
+
 #else
-		#define b3Assert(x)
+#define b3Assert(x)
 #endif
-		//b3FullAssert is optional, slows down a lot
-		#define b3FullAssert(x)
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
 
-		#define b3Likely(_c)  _c
-		#define b3Unlikely(_c) _c
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
 
 #else
 
 #ifdef USE_LIBSPE2
 
-		#define B3_FORCE_INLINE __inline
-		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
-		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
-		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
-		#ifndef assert
-		#include <assert.h>
-		#endif
+#define B3_FORCE_INLINE __inline
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
 #ifdef B3_DEBUG
-		#define b3Assert assert
+#define b3Assert assert
 #else
-		#define b3Assert(x)
+#define b3Assert(x)
 #endif
-		//b3FullAssert is optional, slows down a lot
-		#define b3FullAssert(x)
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
 
+#define b3Likely(_c) __builtin_expect((_c), 1)
+#define b3Unlikely(_c) __builtin_expect((_c), 0)
 
-		#define b3Likely(_c)   __builtin_expect((_c), 1)
-		#define b3Unlikely(_c) __builtin_expect((_c), 0)
-		
-
 #else
-	//non-windows systems
-
-#if (defined (__APPLE__) && (!defined (B3_USE_DOUBLE_PRECISION)))
-    #if defined (__i386__) || defined (__x86_64__)
-        #define B3_USE_SSE
-		//B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
-		//if apps run into issues, we will disable the next line
-		#define B3_USE_SSE_IN_API
-        #ifdef B3_USE_SSE
-            // include appropriate SSE level
-            #if defined (__SSE4_1__)
-                #include <smmintrin.h>
-            #elif defined (__SSSE3__)
-                #include <tmmintrin.h>
-            #elif defined (__SSE3__)
-                #include <pmmintrin.h>
-            #else
-                #include <emmintrin.h>
-            #endif
-        #endif //B3_USE_SSE
-    #elif defined( __armv7__ )
-        #ifdef __clang__
-            #define B3_USE_NEON 1
-
-            #if defined B3_USE_NEON && defined (__clang__)
-                #include <arm_neon.h>
-            #endif//B3_USE_NEON
-       #endif //__clang__
-    #endif//__arm__
-
-	#define B3_FORCE_INLINE inline __attribute__ ((always_inline))
+//non-windows systems
+
+#if (defined(__APPLE__) && (!defined(B3_USE_DOUBLE_PRECISION)))
+#if defined(__i386__) || defined(__x86_64__)
+#define B3_USE_SSE
+//B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+//if apps run into issues, we will disable the next line
+#define B3_USE_SSE_IN_API
+#ifdef B3_USE_SSE
+// include appropriate SSE level
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#elif defined(__SSSE3__)
+#include <tmmintrin.h>
+#elif defined(__SSE3__)
+#include <pmmintrin.h>
+#else
+#include <emmintrin.h>
+#endif
+#endif  //B3_USE_SSE
+#elif defined(__armv7__)
+#ifdef __clang__
+#define B3_USE_NEON 1
+
+#if defined B3_USE_NEON && defined(__clang__)
+#include <arm_neon.h>
+#endif  //B3_USE_NEON
+#endif  //__clang__
+#endif  //__arm__
+
+#define B3_FORCE_INLINE inline __attribute__((always_inline))
 ///@todo: check out alignment methods for other platforms/compilers
-	#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
-	#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
-	#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
-	#ifndef assert
-	#include <assert.h>
-	#endif
-
-	#if defined(DEBUG) || defined (_DEBUG)
-	 #if defined (__i386__) || defined (__x86_64__)
-	#include <stdio.h>
-	 #define b3Assert(x)\
-	{\
-	if(!(x))\
-	{\
-		b3Error("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
-		asm volatile ("int3");\
-	}\
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
+
+#if defined(DEBUG) || defined(_DEBUG)
+#if defined(__i386__) || defined(__x86_64__)
+#include <stdio.h>
+#define b3Assert(x)                                                             \
+	{                                                                           \
+		if (!(x))                                                               \
+		{                                                                       \
+			b3Error("Assert %s in line %d, file %s\n", #x, __LINE__, __FILE__); \
+			asm volatile("int3");                                               \
+		}                                                                       \
 	}
-	#else//defined (__i386__) || defined (__x86_64__)
-		#define b3Assert assert
-	#endif//defined (__i386__) || defined (__x86_64__)
-	#else//defined(DEBUG) || defined (_DEBUG)
-		#define b3Assert(x)
-	#endif//defined(DEBUG) || defined (_DEBUG)
-
-	//b3FullAssert is optional, slows down a lot
-	#define b3FullAssert(x)
-	#define b3Likely(_c)  _c
-	#define b3Unlikely(_c) _c
+#else  //defined (__i386__) || defined (__x86_64__)
+#define b3Assert assert
+#endif  //defined (__i386__) || defined (__x86_64__)
+#else   //defined(DEBUG) || defined (_DEBUG)
+#define b3Assert(x)
+#endif  //defined(DEBUG) || defined (_DEBUG)
+
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
 
 #else
 
-		#define B3_FORCE_INLINE inline
-		///@todo: check out alignment methods for other platforms/compilers
-		#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
-		#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
-		#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
-		///#define B3_ATTRIBUTE_ALIGNED16(a) a
-		///#define B3_ATTRIBUTE_ALIGNED64(a) a
-		///#define B3_ATTRIBUTE_ALIGNED128(a) a
-		#ifndef assert
-		#include <assert.h>
-		#endif
-
-#if defined(DEBUG) || defined (_DEBUG)
-		#define b3Assert assert
+#define B3_FORCE_INLINE inline
+///@todo: check out alignment methods for other platforms/compilers
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+///#define B3_ATTRIBUTE_ALIGNED16(a) a
+///#define B3_ATTRIBUTE_ALIGNED64(a) a
+///#define B3_ATTRIBUTE_ALIGNED128(a) a
+#ifndef assert
+#include <assert.h>
+#endif
+
+#if defined(DEBUG) || defined(_DEBUG)
+#define b3Assert assert
 #else
-		#define b3Assert(x)
+#define b3Assert(x)
 #endif
 
-		//b3FullAssert is optional, slows down a lot
-		#define b3FullAssert(x)
-		#define b3Likely(_c)  _c
-		#define b3Unlikely(_c) _c
-#endif //__APPLE__ 
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
+#endif  //__APPLE__
 
-#endif // LIBSPE2
+#endif  // LIBSPE2
 
-#endif	//__CELLOS_LV2__
+#endif  //__CELLOS_LV2__
 #endif
 
-
 ///The b3Scalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
 #if defined(B3_USE_DOUBLE_PRECISION)
 typedef double b3Scalar;
@@ -267,34 +279,34 @@ typedef float b3Scalar;
 
 #ifdef B3_USE_SSE
 typedef __m128 b3SimdFloat4;
-#endif//B3_USE_SSE
+#endif  //B3_USE_SSE
 
-#if defined B3_USE_SSE_IN_API && defined (B3_USE_SSE)
+#if defined B3_USE_SSE_IN_API && defined(B3_USE_SSE)
 #ifdef _WIN32
 
 #ifndef B3_NAN
 static int b3NanMask = 0x7F800001;
-#define B3_NAN (*(float*)&b3NanMask)
+#define B3_NAN (*(float *)&b3NanMask)
 #endif
 
 #ifndef B3_INFINITY_MASK
-static  int b3InfinityMask = 0x7F800000;
-#define B3_INFINITY_MASK (*(float*)&b3InfinityMask)
+static int b3InfinityMask = 0x7F800000;
+#define B3_INFINITY_MASK (*(float *)&b3InfinityMask)
 #endif
 
-inline __m128 operator + (const __m128 A, const __m128 B)
+inline __m128 operator+(const __m128 A, const __m128 B)
 {
-    return _mm_add_ps(A, B);
+	return _mm_add_ps(A, B);
 }
 
-inline __m128 operator - (const __m128 A, const __m128 B)
+inline __m128 operator-(const __m128 A, const __m128 B)
 {
-    return _mm_sub_ps(A, B);
+	return _mm_sub_ps(A, B);
 }
 
-inline __m128 operator * (const __m128 A, const __m128 B)
+inline __m128 operator*(const __m128 A, const __m128 B)
 {
-    return _mm_mul_ps(A, B);
+	return _mm_mul_ps(A, B);
 }
 
 #define b3CastfTo128i(a) (_mm_castps_si128(a))
@@ -302,18 +314,19 @@ inline __m128 operator * (const __m128 A, const __m128 B)
 #define b3CastiTo128f(a) (_mm_castsi128_ps(a))
 #define b3CastdTo128f(a) (_mm_castpd_ps(a))
 #define b3CastdTo128i(a) (_mm_castpd_si128(a))
-#define b3Assign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
+#define b3Assign128(r0, r1, r2, r3) _mm_setr_ps(r0, r1, r2, r3)
 
-#else//_WIN32
+#else  //_WIN32
 
 #define b3CastfTo128i(a) ((__m128i)(a))
 #define b3CastfTo128d(a) ((__m128d)(a))
-#define b3CastiTo128f(a)  ((__m128) (a))
-#define b3CastdTo128f(a) ((__m128) (a))
+#define b3CastiTo128f(a) ((__m128)(a))
+#define b3CastdTo128f(a) ((__m128)(a))
 #define b3CastdTo128i(a) ((__m128i)(a))
-#define b3Assign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
-#endif//_WIN32
-#endif //B3_USE_SSE_IN_API
+#define b3Assign128(r0, r1, r2, r3) \
+	(__m128) { r0, r1, r2, r3 }
+#endif  //_WIN32
+#endif  //B3_USE_SSE_IN_API
 
 #ifdef B3_USE_NEON
 #include <arm_neon.h>
@@ -321,142 +334,160 @@ inline __m128 operator * (const __m128 A, const __m128 B)
 typedef float32x4_t b3SimdFloat4;
 #define B3_INFINITY INFINITY
 #define B3_NAN NAN
-#define b3Assign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
+#define b3Assign128(r0, r1, r2, r3) \
+	(float32x4_t) { r0, r1, r2, r3 }
 #endif
 
-
-
-
-
-#define B3_DECLARE_ALIGNED_ALLOCATOR() \
-   B3_FORCE_INLINE void* operator new(size_t sizeInBytes)   { return b3AlignedAlloc(sizeInBytes,16); }   \
-   B3_FORCE_INLINE void  operator delete(void* ptr)         { b3AlignedFree(ptr); }   \
-   B3_FORCE_INLINE void* operator new(size_t, void* ptr)   { return ptr; }   \
-   B3_FORCE_INLINE void  operator delete(void*, void*)      { }   \
-   B3_FORCE_INLINE void* operator new[](size_t sizeInBytes)   { return b3AlignedAlloc(sizeInBytes,16); }   \
-   B3_FORCE_INLINE void  operator delete[](void* ptr)         { b3AlignedFree(ptr); }   \
-   B3_FORCE_INLINE void* operator new[](size_t, void* ptr)   { return ptr; }   \
-   B3_FORCE_INLINE void  operator delete[](void*, void*)      { }   \
-
-
+#define B3_DECLARE_ALIGNED_ALLOCATOR()                                                                   \
+	B3_FORCE_INLINE void *operator new(size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); }   \
+	B3_FORCE_INLINE void operator delete(void *ptr) { b3AlignedFree(ptr); }                              \
+	B3_FORCE_INLINE void *operator new(size_t, void *ptr) { return ptr; }                                \
+	B3_FORCE_INLINE void operator delete(void *, void *) {}                                              \
+	B3_FORCE_INLINE void *operator new[](size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); } \
+	B3_FORCE_INLINE void operator delete[](void *ptr) { b3AlignedFree(ptr); }                            \
+	B3_FORCE_INLINE void *operator new[](size_t, void *ptr) { return ptr; }                              \
+	B3_FORCE_INLINE void operator delete[](void *, void *) {}
 
 #if defined(B3_USE_DOUBLE_PRECISION) || defined(B3_FORCE_DOUBLE_FUNCTIONS)
-		
-B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x) { return sqrt(x); }
+
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x)
+{
+	return sqrt(x);
+}
 B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabs(x); }
 B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cos(x); }
 B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sin(x); }
 B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tan(x); }
-B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { if (x<b3Scalar(-1))	x=b3Scalar(-1); if (x>b3Scalar(1))	x=b3Scalar(1); return acos(x); }
-B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { if (x<b3Scalar(-1))	x=b3Scalar(-1); if (x>b3Scalar(1))	x=b3Scalar(1); return asin(x); }
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x)
+{
+	if (x < b3Scalar(-1)) x = b3Scalar(-1);
+	if (x > b3Scalar(1)) x = b3Scalar(1);
+	return acos(x);
+}
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x)
+{
+	if (x < b3Scalar(-1)) x = b3Scalar(-1);
+	if (x > b3Scalar(1)) x = b3Scalar(1);
+	return asin(x);
+}
 B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atan(x); }
 B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2(x, y); }
 B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return exp(x); }
 B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return log(x); }
-B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return pow(x,y); }
-B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmod(x,y); }
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return pow(x, y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmod(x, y); }
 
 #else
-		
-B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y) 
-{ 
+
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y)
+{
 #ifdef USE_APPROXIMATION
-    double x, z, tempf;
-    unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
+	double x, z, tempf;
+	unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
 
 	tempf = y;
-	*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
-	x =  tempf;
-	z =  y*b3Scalar(0.5);
-	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);         /* iteration formula     */
-	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
-	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
-	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
-	x = (b3Scalar(1.5)*x)-(x*x)*(x*z);
-	return x*y;
+	*tfptr = (0xbfcdd90a - *tfptr) >> 1; /* estimate of 1/sqrt(y) */
+	x = tempf;
+	z = y * b3Scalar(0.5);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z); /* iteration formula     */
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	return x * y;
 #else
-	return sqrtf(y); 
+	return sqrtf(y);
 #endif
 }
 B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabsf(x); }
 B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cosf(x); }
 B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sinf(x); }
 B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tanf(x); }
-B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x) { 
-	if (x<b3Scalar(-1))	
-		x=b3Scalar(-1); 
-	if (x>b3Scalar(1))	
-		x=b3Scalar(1);
-	return acosf(x); 
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x)
+{
+	if (x < b3Scalar(-1))
+		x = b3Scalar(-1);
+	if (x > b3Scalar(1))
+		x = b3Scalar(1);
+	return acosf(x);
 }
-B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x) { 
-	if (x<b3Scalar(-1))	
-		x=b3Scalar(-1); 
-	if (x>b3Scalar(1))	
-		x=b3Scalar(1);
-	return asinf(x); 
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x)
+{
+	if (x < b3Scalar(-1))
+		x = b3Scalar(-1);
+	if (x > b3Scalar(1))
+		x = b3Scalar(1);
+	return asinf(x);
 }
 B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atanf(x); }
 B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2f(x, y); }
 B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return expf(x); }
 B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return logf(x); }
-B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x,b3Scalar y) { return powf(x,y); }
-B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x,b3Scalar y) { return fmodf(x,y); }
-	
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return powf(x, y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmodf(x, y); }
+
 #endif
 
-#define B3_2_PI         b3Scalar(6.283185307179586232)
-#define B3_PI           (B3_2_PI * b3Scalar(0.5))
-#define B3_HALF_PI      (B3_2_PI * b3Scalar(0.25))
+#define B3_2_PI b3Scalar(6.283185307179586232)
+#define B3_PI (B3_2_PI * b3Scalar(0.5))
+#define B3_HALF_PI (B3_2_PI * b3Scalar(0.25))
 #define B3_RADS_PER_DEG (B3_2_PI / b3Scalar(360.0))
-#define B3_DEGS_PER_RAD  (b3Scalar(360.0) / B3_2_PI)
+#define B3_DEGS_PER_RAD (b3Scalar(360.0) / B3_2_PI)
 #define B3_SQRT12 b3Scalar(0.7071067811865475244008443621048490)
 
-#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0)/b3Sqrt(b3Scalar(x))))		/* reciprocal square root */
-
+#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0) / b3Sqrt(b3Scalar(x)))) /* reciprocal square root */
 
 #ifdef B3_USE_DOUBLE_PRECISION
-#define B3_EPSILON      DBL_EPSILON
-#define B3_INFINITY     DBL_MAX
+#define B3_EPSILON DBL_EPSILON
+#define B3_INFINITY DBL_MAX
 #else
-#define B3_EPSILON      FLT_EPSILON
-#define B3_INFINITY     FLT_MAX
+#define B3_EPSILON FLT_EPSILON
+#define B3_INFINITY FLT_MAX
 #endif
 
-B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x) 
+B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x)
 {
 	b3Scalar coeff_1 = B3_PI / 4.0f;
 	b3Scalar coeff_2 = 3.0f * coeff_1;
 	b3Scalar abs_y = b3Fabs(y);
 	b3Scalar angle;
-	if (x >= 0.0f) {
+	if (x >= 0.0f)
+	{
 		b3Scalar r = (x - abs_y) / (x + abs_y);
 		angle = coeff_1 - coeff_1 * r;
-	} else {
+	}
+	else
+	{
 		b3Scalar r = (x + abs_y) / (abs_y - x);
 		angle = coeff_2 - coeff_1 * r;
 	}
 	return (y < 0.0f) ? -angle : angle;
 }
 
-B3_FORCE_INLINE bool      b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; }
+B3_FORCE_INLINE bool b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; }
 
-B3_FORCE_INLINE bool	b3Equal(b3Scalar a, b3Scalar eps) {
+B3_FORCE_INLINE bool b3Equal(b3Scalar a, b3Scalar eps)
+{
 	return (((a) <= eps) && !((a) < -eps));
 }
-B3_FORCE_INLINE bool	b3GreaterEqual (b3Scalar a, b3Scalar eps) {
+B3_FORCE_INLINE bool b3GreaterEqual(b3Scalar a, b3Scalar eps)
+{
 	return (!((a) <= eps));
 }
 
-
-B3_FORCE_INLINE int       b3IsNegative(b3Scalar x) {
-    return x < b3Scalar(0.0) ? 1 : 0;
+B3_FORCE_INLINE int b3IsNegative(b3Scalar x)
+{
+	return x < b3Scalar(0.0) ? 1 : 0;
 }
 
 B3_FORCE_INLINE b3Scalar b3Radians(b3Scalar x) { return x * B3_RADS_PER_DEG; }
 B3_FORCE_INLINE b3Scalar b3Degrees(b3Scalar x) { return x * B3_DEGS_PER_RAD; }
 
-#define B3_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
+#define B3_DECLARE_HANDLE(name) \
+	typedef struct name##__     \
+	{                           \
+		int unused;             \
+	} * name
 
 #ifndef b3Fsel
 B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c)
@@ -464,60 +495,57 @@ B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c)
 	return a >= 0 ? b : c;
 }
 #endif
-#define b3Fsels(a,b,c) (b3Scalar)b3Fsel(a,b,c)
-
+#define b3Fsels(a, b, c) ((b3Scalar)b3Fsel(a, b, c)) /* parenthesized so the expansion is a single operand */
 
 B3_FORCE_INLINE bool b3MachineIsLittleEndian()
 {
-   long int i = 1;
-   const char *p = (const char *) &i;
-   if (p[0] == 1)  // Lowest address contains the least significant byte
-	   return true;
-   else
-	   return false;
+	long int i = 1;
+	const char *p = (const char *)&i;
+	if (p[0] == 1)  // Lowest address contains the least significant byte
+		return true;
+	else
+		return false;
 }
 
-
-
 ///b3Select avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
 ///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
-B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) 
+B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
 {
-    // Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
-    // Rely on positive value or'ed with its negative having sign bit on
-    // and zero value or'ed with its negative (which is still zero) having sign bit off 
-    // Use arithmetic shift right, shifting the sign bit through all 32 bits
-    unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
-    unsigned testEqz = ~testNz;
-    return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); 
+	// Set testNz to all ones if condition is nonzero, all zeros if condition is zero.
+	// (condition != 0) is 0 or 1; subtracting it from 0u wraps in unsigned arithmetic,
+	// so the mask is built without a branch, without negating a signed int (undefined
+	// behavior for INT_MIN) and without relying on arithmetic right shift of a negative int.
+	unsigned testNz = 0u - (unsigned)(condition != 0);
+	unsigned testEqz = ~testNz;
+	return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
 }
 B3_FORCE_INLINE int b3Select(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
 {
-    unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
-    unsigned testEqz = ~testNz; 
-    return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
+	unsigned testNz = 0u - (unsigned)(condition != 0);  // all-ones mask if nonzero; no signed negation or shift
+	unsigned testEqz = ~testNz;
+	return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
 }
 B3_FORCE_INLINE float b3Select(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
 {
 #ifdef B3_HAVE_NATIVE_FSEL
-    return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
+	return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
 #else
-    return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero; 
+	return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero;
 #endif
 }
 
-template<typename T> B3_FORCE_INLINE void b3Swap(T& a, T& b)
+template <typename T>
+B3_FORCE_INLINE void b3Swap(T &a, T &b)
 {
 	T tmp = a;
 	a = b;
 	b = tmp;
 }
 
-
 //PCK: endian swapping functions
 B3_FORCE_INLINE unsigned b3SwapEndian(unsigned val)
 {
-	return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8)  | ((val & 0x000000ff) << 24));
+	return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24));
 }
 
 B3_FORCE_INLINE unsigned short b3SwapEndian(unsigned short val)
@@ -532,87 +560,85 @@ B3_FORCE_INLINE unsigned b3SwapEndian(int val)
 
 B3_FORCE_INLINE unsigned short b3SwapEndian(short val)
 {
-	return b3SwapEndian((unsigned short) val);
+	return b3SwapEndian((unsigned short)val);
 }
 
 ///b3SwapFloat uses using char pointers to swap the endianness
 ////b3SwapFloat/b3SwapDouble will NOT return a float, because the machine might 'correct' invalid floating point values
-///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. 
-///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. 
-///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. 
+///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
+///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
+///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
 ///so instead of returning a float/double, we return integer/long long integer
-B3_FORCE_INLINE unsigned int  b3SwapEndianFloat(float d)
+B3_FORCE_INLINE unsigned int b3SwapEndianFloat(float d)
 {
-    unsigned int a = 0;
-    unsigned char *dst = (unsigned char *)&a;
-    unsigned char *src = (unsigned char *)&d;
-
-    dst[0] = src[3];
-    dst[1] = src[2];
-    dst[2] = src[1];
-    dst[3] = src[0];
-    return a;
+	unsigned int a = 0;
+	unsigned char *dst = (unsigned char *)&a;
+	unsigned char *src = (unsigned char *)&d;
+
+	dst[0] = src[3];
+	dst[1] = src[2];
+	dst[2] = src[1];
+	dst[3] = src[0];
+	return a;
 }
 
 // unswap using char pointers
-B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a) 
+B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a)
 {
-    float d = 0.0f;
-    unsigned char *src = (unsigned char *)&a;
-    unsigned char *dst = (unsigned char *)&d;
+	float d = 0.0f;
+	unsigned char *src = (unsigned char *)&a;
+	unsigned char *dst = (unsigned char *)&d;
 
-    dst[0] = src[3];
-    dst[1] = src[2];
-    dst[2] = src[1];
-    dst[3] = src[0];
+	dst[0] = src[3];
+	dst[1] = src[2];
+	dst[2] = src[1];
+	dst[3] = src[0];
 
-    return d;
+	return d;
 }
 
-
 // swap using char pointers
-B3_FORCE_INLINE void  b3SwapEndianDouble(double d, unsigned char* dst)
+B3_FORCE_INLINE void b3SwapEndianDouble(double d, unsigned char *dst)
 {
-    unsigned char *src = (unsigned char *)&d;
-
-    dst[0] = src[7];
-    dst[1] = src[6];
-    dst[2] = src[5];
-    dst[3] = src[4];
-    dst[4] = src[3];
-    dst[5] = src[2];
-    dst[6] = src[1];
-    dst[7] = src[0];
-
+	unsigned char *src = (unsigned char *)&d;
+
+	dst[0] = src[7];
+	dst[1] = src[6];
+	dst[2] = src[5];
+	dst[3] = src[4];
+	dst[4] = src[3];
+	dst[5] = src[2];
+	dst[6] = src[1];
+	dst[7] = src[0];
 }
 
 // unswap using char pointers
-B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src) 
+B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src)
 {
-    double d = 0.0;
-    unsigned char *dst = (unsigned char *)&d;
-
-    dst[0] = src[7];
-    dst[1] = src[6];
-    dst[2] = src[5];
-    dst[3] = src[4];
-    dst[4] = src[3];
-    dst[5] = src[2];
-    dst[6] = src[1];
-    dst[7] = src[0];
+	double d = 0.0;
+	unsigned char *dst = (unsigned char *)&d;
+
+	dst[0] = src[7];
+	dst[1] = src[6];
+	dst[2] = src[5];
+	dst[3] = src[4];
+	dst[4] = src[3];
+	dst[5] = src[2];
+	dst[6] = src[1];
+	dst[7] = src[0];
 
 	return d;
 }
 
 // returns normalized value in range [-B3_PI, B3_PI]
-B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians) 
+B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians)
 {
 	angleInRadians = b3Fmod(angleInRadians, B3_2_PI);
-	if(angleInRadians < -B3_PI)
+	if (angleInRadians < -B3_PI)
 	{
 		return angleInRadians + B3_2_PI;
 	}
-	else if(angleInRadians > B3_PI)
+	else if (angleInRadians > B3_PI)
 	{
 		return angleInRadians - B3_2_PI;
 	}
@@ -626,38 +652,34 @@ B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians)
 struct b3TypedObject
 {
 	b3TypedObject(int objectType)
-		:m_objectType(objectType)
+		: m_objectType(objectType)
 	{
 	}
-	int	m_objectType;
+	int m_objectType;
 	inline int getObjectType() const
 	{
 		return m_objectType;
 	}
 };
 
-
-  
 ///align a pointer to the provided alignment, upwards
-template <typename T>T* b3AlignPointer(T* unalignedPtr, size_t alignment)
+template <typename T>
+T *b3AlignPointer(T *unalignedPtr, size_t alignment)
 {
-		
 	struct b3ConvertPointerSizeT
 	{
-		union 
-		{
-				T* ptr;
-				size_t integer;
+		union {
+			T *ptr;
+			size_t integer;
 		};
 	};
-    b3ConvertPointerSizeT converter;
-    
-    
+	b3ConvertPointerSizeT converter;
+
 	const size_t bit_mask = ~(alignment - 1);
-    converter.ptr = unalignedPtr;
-	converter.integer += alignment-1;
+	converter.ptr = unalignedPtr;
+	converter.integer += alignment - 1;
 	converter.integer &= bit_mask;
 	return converter.ptr;
 }
 
-#endif //B3_SCALAR_H
+#endif  //B3_SCALAR_H
diff --git a/thirdparty/bullet/Bullet3Common/b3StackAlloc.h b/thirdparty/bullet/Bullet3Common/b3StackAlloc.h
index de7de056b5..4972236ac7 100644
--- a/thirdparty/bullet/Bullet3Common/b3StackAlloc.h
+++ b/thirdparty/bullet/Bullet3Common/b3StackAlloc.h
@@ -20,97 +20,99 @@ Nov.2006
 #ifndef B3_STACK_ALLOC
 #define B3_STACK_ALLOC
 
-#include "b3Scalar.h" //for b3Assert
+#include "b3Scalar.h"  //for b3Assert
 #include "b3AlignedAllocator.h"
 
 ///The b3Block class is an internal structure for the b3StackAlloc memory allocator.
 struct b3Block
 {
-	b3Block*			previous;
-	unsigned char*		address;
+	b3Block* previous;
+	unsigned char* address;
 };
 
 ///The StackAlloc class provides some fast stack-based memory allocator (LIFO last-in first-out)
 class b3StackAlloc
 {
 public:
+	b3StackAlloc(unsigned int size)
+	{
+		ctor();
+		create(size);
+	}
+	~b3StackAlloc() { destroy(); }
 
-	b3StackAlloc(unsigned int size)	{ ctor();create(size); }
-	~b3StackAlloc()		{ destroy(); }
-	
-	inline void		create(unsigned int size)
+	inline void create(unsigned int size)
 	{
 		destroy();
-		data		=  (unsigned char*) b3AlignedAlloc(size,16);
-		totalsize	=	size;
+		data = (unsigned char*)b3AlignedAlloc(size, 16);
+		totalsize = size;
 	}
-	inline void		destroy()
+	inline void destroy()
 	{
-		b3Assert(usedsize==0);
+		b3Assert(usedsize == 0);
 		//Raise(L"StackAlloc is still in use");
 
-		if(usedsize==0)
+		if (usedsize == 0)
 		{
-			if(!ischild && data)		
+			if (!ischild && data)
 				b3AlignedFree(data);
 
-			data				=	0;
-			usedsize			=	0;
+			data = 0;
+			usedsize = 0;
 		}
-		
 	}
 
-	int	getAvailableMemory() const
+	int getAvailableMemory() const
 	{
 		return static_cast<int>(totalsize - usedsize);
 	}
 
-	unsigned char*			allocate(unsigned int size)
+	unsigned char* allocate(unsigned int size)
 	{
-		const unsigned int	nus(usedsize+size);
-		if(nus<totalsize)
+		const unsigned int nus(usedsize + size);
+		if (nus >= usedsize && nus < totalsize)  // first test rejects unsigned wrap-around of usedsize + size
 		{
-			usedsize=nus;
-			return(data+(usedsize-size));
+			usedsize = nus;
+			return (data + (usedsize - size));
 		}
 		b3Assert(0);
 		//&& (L"Not enough memory"));
-		
-		return(0);
+
+		return (0);
 	}
-	B3_FORCE_INLINE b3Block*		beginBlock()
+	B3_FORCE_INLINE b3Block* beginBlock()
 	{
-		b3Block*	pb = (b3Block*)allocate(sizeof(b3Block));
-		pb->previous	=	current;
-		pb->address		=	data+usedsize;
-		current			=	pb;
-		return(pb);
+		b3Block* pb = (b3Block*)allocate(sizeof(b3Block));
+		pb->previous = current;
+		pb->address = data + usedsize;
+		current = pb;
+		return (pb);
 	}
-	B3_FORCE_INLINE void		endBlock(b3Block* block)
+	B3_FORCE_INLINE void endBlock(b3Block* block)
 	{
-		b3Assert(block==current);
+		b3Assert(block == current);
 		//Raise(L"Unmatched blocks");
-		if(block==current)
+		if (block == current)
 		{
-			current		=	block->previous;
-			usedsize	=	(unsigned int)((block->address-data)-sizeof(b3Block));
+			current = block->previous;
+			usedsize = (unsigned int)((block->address - data) - sizeof(b3Block));
 		}
 	}
 
 private:
-	void		ctor()
+	void ctor()
 	{
-		data		=	0;
-		totalsize	=	0;
-		usedsize	=	0;
-		current		=	0;
-		ischild		=	false;
+		data = 0;
+		totalsize = 0;
+		usedsize = 0;
+		current = 0;
+		ischild = false;
 	}
-	unsigned char*		data;
-	unsigned int		totalsize;
-	unsigned int		usedsize;
-	b3Block*	current;
-	bool		ischild;
+	unsigned char* data;
+	unsigned int totalsize;
+	unsigned int usedsize;
+	b3Block* current;
+	bool ischild;
 };
 
-#endif //B3_STACK_ALLOC
+#endif  //B3_STACK_ALLOC
diff --git a/thirdparty/bullet/Bullet3Common/b3Transform.h b/thirdparty/bullet/Bullet3Common/b3Transform.h
index fa480759be..149da9d148 100644
--- a/thirdparty/bullet/Bullet3Common/b3Transform.h
+++ b/thirdparty/bullet/Bullet3Common/b3Transform.h
@@ -12,11 +12,9 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef B3_TRANSFORM_H
 #define B3_TRANSFORM_H
 
-
 #include "b3Matrix3x3.h"
 
 #ifdef B3_USE_DOUBLE_PRECISION
@@ -25,46 +23,45 @@ subject to the following restrictions:
 #define b3TransformData b3TransformFloatData
 #endif
 
-
-
-
 /**@brief The b3Transform class supports rigid transforms with only translation and rotation and no scaling/shear.
  *It can be used in combination with b3Vector3, b3Quaternion and b3Matrix3x3 linear algebra classes. */
-B3_ATTRIBUTE_ALIGNED16(class) b3Transform {
-	
-  ///Storage for the rotation
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Transform
+{
+	///Storage for the rotation
 	b3Matrix3x3 m_basis;
-  ///Storage for the translation
-	b3Vector3   m_origin;
+	///Storage for the translation
+	b3Vector3 m_origin;
 
 public:
-	
-  /**@brief No initialization constructor */
+	/**@brief No initialization constructor */
 	b3Transform() {}
-  /**@brief Constructor from b3Quaternion (optional b3Vector3 )
+	/**@brief Constructor from b3Quaternion (optional b3Vector3 )
    * @param q Rotation from quaternion 
    * @param c Translation from Vector (default 0,0,0) */
-	explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q, 
-		const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0))) 
+	explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q,
+										 const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
 		: m_basis(q),
-		m_origin(c)
-	{}
+		  m_origin(c)
+	{
+	}
 
-  /**@brief Constructor from b3Matrix3x3 (optional b3Vector3)
+	/**@brief Constructor from b3Matrix3x3 (optional b3Vector3)
    * @param b Rotation from Matrix 
    * @param c Translation from Vector default (0,0,0)*/
-	explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b, 
-		const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
+	explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b,
+										 const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
 		: m_basis(b),
-		m_origin(c)
-	{}
-  /**@brief Copy constructor */
-	B3_FORCE_INLINE b3Transform (const b3Transform& other)
+		  m_origin(c)
+	{
+	}
+	/**@brief Copy constructor */
+	B3_FORCE_INLINE b3Transform(const b3Transform& other)
 		: m_basis(other.m_basis),
-		m_origin(other.m_origin)
+		  m_origin(other.m_origin)
 	{
 	}
-  /**@brief Assignment Operator */
+	/**@brief Assignment Operator */
 	B3_FORCE_INLINE b3Transform& operator=(const b3Transform& other)
 	{
 		m_basis = other.m_basis;
@@ -72,70 +69,70 @@ public:
 		return *this;
 	}
 
-
-  /**@brief Set the current transform as the value of the product of two transforms
+	/**@brief Set the current transform as the value of the product of two transforms
    * @param t1 Transform 1
    * @param t2 Transform 2
    * This = Transform1 * Transform2 */
-		B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2) {
-			m_basis = t1.m_basis * t2.m_basis;
-			m_origin = t1(t2.m_origin);
-		}
+	B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2)
+	{
+		m_basis = t1.m_basis * t2.m_basis;
+		m_origin = t1(t2.m_origin);
+	}
 
-/*		void multInverseLeft(const b3Transform& t1, const b3Transform& t2) {
+	/*		void multInverseLeft(const b3Transform& t1, const b3Transform& t2) {
 			b3Vector3 v = t2.m_origin - t1.m_origin;
 			m_basis = b3MultTransposeLeft(t1.m_basis, t2.m_basis);
 			m_origin = v * t1.m_basis;
 		}
 		*/
 
-/**@brief Return the transform of the vector */
+	/**@brief Return the transform of the vector */
 	B3_FORCE_INLINE b3Vector3 operator()(const b3Vector3& x) const
 	{
-        return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin;
+		return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin;
 	}
 
-  /**@brief Return the transform of the vector */
+	/**@brief Return the transform of the vector */
 	B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& x) const
 	{
 		return (*this)(x);
 	}
 
-  /**@brief Return the transform of the b3Quaternion */
+	/**@brief Return the transform of the b3Quaternion */
 	B3_FORCE_INLINE b3Quaternion operator*(const b3Quaternion& q) const
 	{
 		return getRotation() * q;
 	}
 
-  /**@brief Return the basis matrix for the rotation */
-	B3_FORCE_INLINE b3Matrix3x3&       getBasis()          { return m_basis; }
-  /**@brief Return the basis matrix for the rotation */
-	B3_FORCE_INLINE const b3Matrix3x3& getBasis()    const { return m_basis; }
+	/**@brief Return the basis matrix for the rotation */
+	B3_FORCE_INLINE b3Matrix3x3& getBasis() { return m_basis; }
+	/**@brief Return the basis matrix for the rotation */
+	B3_FORCE_INLINE const b3Matrix3x3& getBasis() const { return m_basis; }
 
-  /**@brief Return the origin vector translation */
-	B3_FORCE_INLINE b3Vector3&         getOrigin()         { return m_origin; }
-  /**@brief Return the origin vector translation */
-	B3_FORCE_INLINE const b3Vector3&   getOrigin()   const { return m_origin; }
+	/**@brief Return the origin vector translation */
+	B3_FORCE_INLINE b3Vector3& getOrigin() { return m_origin; }
+	/**@brief Return the origin vector translation */
+	B3_FORCE_INLINE const b3Vector3& getOrigin() const { return m_origin; }
 
-  /**@brief Return a quaternion representing the rotation */
-	b3Quaternion getRotation() const { 
+	/**@brief Return a quaternion representing the rotation */
+	b3Quaternion getRotation() const
+	{
 		b3Quaternion q;
 		m_basis.getRotation(q);
 		return q;
 	}
-	
-	
-  /**@brief Set from an array 
+
+	/**@brief Set from an array 
    * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */
-	void setFromOpenGLMatrix(const b3Scalar *m)
+	void setFromOpenGLMatrix(const b3Scalar* m)
 	{
 		m_basis.setFromOpenGLSubMatrix(m);
-		m_origin.setValue(m[12],m[13],m[14]);
+		m_origin.setValue(m[12], m[13], m[14]);
 	}
 
-  /**@brief Fill an array representation
+	/**@brief Fill an array representation
    * @param m A pointer to a 15 element array (12 rotation(row major padded on the right by 1), and 3 translation */
-	void getOpenGLMatrix(b3Scalar *m) const 
+	void getOpenGLMatrix(b3Scalar * m) const
 	{
 		m_basis.getOpenGLSubMatrix(m);
 		m[12] = m_origin.getX();
@@ -144,80 +141,76 @@ public:
 		m[15] = b3Scalar(1.0);
 	}
 
-  /**@brief Set the translational element
+	/**@brief Set the translational element
    * @param origin The vector to set the translation to */
-	B3_FORCE_INLINE void setOrigin(const b3Vector3& origin) 
-	{ 
+	B3_FORCE_INLINE void setOrigin(const b3Vector3& origin)
+	{
 		m_origin = origin;
 	}
 
 	B3_FORCE_INLINE b3Vector3 invXform(const b3Vector3& inVec) const;
 
-
-  /**@brief Set the rotational element by b3Matrix3x3 */
+	/**@brief Set the rotational element by b3Matrix3x3 */
 	B3_FORCE_INLINE void setBasis(const b3Matrix3x3& basis)
-	{ 
+	{
 		m_basis = basis;
 	}
 
-  /**@brief Set the rotational element by b3Quaternion */
+	/**@brief Set the rotational element by b3Quaternion */
 	B3_FORCE_INLINE void setRotation(const b3Quaternion& q)
 	{
 		m_basis.setRotation(q);
 	}
 
-
-  /**@brief Set this transformation to the identity */
+	/**@brief Set this transformation to the identity */
 	void setIdentity()
 	{
 		m_basis.setIdentity();
 		m_origin.setValue(b3Scalar(0.0), b3Scalar(0.0), b3Scalar(0.0));
 	}
 
-  /**@brief Multiply this Transform by another(this = this * another) 
+	/**@brief Multiply this Transform by another(this = this * another) 
    * @param t The other transform */
-	b3Transform& operator*=(const b3Transform& t) 
+	b3Transform& operator*=(const b3Transform& t)
 	{
 		m_origin += m_basis * t.m_origin;
 		m_basis *= t.m_basis;
 		return *this;
 	}
 
-  /**@brief Return the inverse of this transform */
+	/**@brief Return the inverse of this transform */
 	b3Transform inverse() const
-	{ 
+	{
 		b3Matrix3x3 inv = m_basis.transpose();
 		return b3Transform(inv, inv * -m_origin);
 	}
 
-  /**@brief Return the inverse of this transform times the other transform
+	/**@brief Return the inverse of this transform times the other transform
    * @param t The other transform 
    * return this.inverse() * the other */
-	b3Transform inverseTimes(const b3Transform& t) const;  
+	b3Transform inverseTimes(const b3Transform& t) const;
 
-  /**@brief Return the product of this transform and the other */
+	/**@brief Return the product of this transform and the other */
 	b3Transform operator*(const b3Transform& t) const;
 
-  /**@brief Return an identity transform */
-	static const b3Transform&	getIdentity()
+	/**@brief Return an identity transform */
+	static const b3Transform& getIdentity()
 	{
 		static const b3Transform identityTransform(b3Matrix3x3::getIdentity());
 		return identityTransform;
 	}
 
-	void	serialize(struct	b3TransformData& dataOut) const;
-
-	void	serializeFloat(struct	b3TransformFloatData& dataOut) const;
+	void serialize(struct b3TransformData & dataOut) const;
 
-	void	deSerialize(const struct	b3TransformData& dataIn);
+	void serializeFloat(struct b3TransformFloatData & dataOut) const;
 
-	void	deSerializeDouble(const struct	b3TransformDoubleData& dataIn);
+	void deSerialize(const struct b3TransformData& dataIn);
 
-	void	deSerializeFloat(const struct	b3TransformFloatData& dataIn);
+	void deSerializeDouble(const struct b3TransformDoubleData& dataIn);
 
+	void deSerializeFloat(const struct b3TransformFloatData& dataIn);
 };
 
-
 B3_FORCE_INLINE b3Vector3
 b3Transform::invXform(const b3Vector3& inVec) const
 {
@@ -225,80 +218,69 @@ b3Transform::invXform(const b3Vector3& inVec) const
 	return (m_basis.transpose() * v);
 }
 
-B3_FORCE_INLINE b3Transform 
-b3Transform::inverseTimes(const b3Transform& t) const  
+B3_FORCE_INLINE b3Transform
+b3Transform::inverseTimes(const b3Transform& t) const
 {
 	b3Vector3 v = t.getOrigin() - m_origin;
-		return b3Transform(m_basis.transposeTimes(t.m_basis),
-			v * m_basis);
+	return b3Transform(m_basis.transposeTimes(t.m_basis),
+					   v * m_basis);
 }
 
-B3_FORCE_INLINE b3Transform 
-b3Transform::operator*(const b3Transform& t) const
+B3_FORCE_INLINE b3Transform
+	b3Transform::operator*(const b3Transform& t) const
 {
-	return b3Transform(m_basis * t.m_basis, 
-		(*this)(t.m_origin));
+	return b3Transform(m_basis * t.m_basis,
+					   (*this)(t.m_origin));
 }
 
 /**@brief Test if two transforms have all elements equal */
 B3_FORCE_INLINE bool operator==(const b3Transform& t1, const b3Transform& t2)
 {
-   return ( t1.getBasis()  == t2.getBasis() &&
-            t1.getOrigin() == t2.getOrigin() );
+	return (t1.getBasis() == t2.getBasis() &&
+			t1.getOrigin() == t2.getOrigin());
 }
 
-
 ///for serialization
-struct	b3TransformFloatData
+struct b3TransformFloatData
 {
-	b3Matrix3x3FloatData	m_basis;
-	b3Vector3FloatData	m_origin;
+	b3Matrix3x3FloatData m_basis;
+	b3Vector3FloatData m_origin;
 };
 
-struct	b3TransformDoubleData
+struct b3TransformDoubleData
 {
-	b3Matrix3x3DoubleData	m_basis;
-	b3Vector3DoubleData	m_origin;
+	b3Matrix3x3DoubleData m_basis;
+	b3Vector3DoubleData m_origin;
 };
 
-
-
-B3_FORCE_INLINE	void	b3Transform::serialize(b3TransformData& dataOut) const
+B3_FORCE_INLINE void b3Transform::serialize(b3TransformData& dataOut) const
 {
 	m_basis.serialize(dataOut.m_basis);
 	m_origin.serialize(dataOut.m_origin);
 }
 
-B3_FORCE_INLINE	void	b3Transform::serializeFloat(b3TransformFloatData& dataOut) const
+B3_FORCE_INLINE void b3Transform::serializeFloat(b3TransformFloatData& dataOut) const
 {
 	m_basis.serializeFloat(dataOut.m_basis);
 	m_origin.serializeFloat(dataOut.m_origin);
 }
 
-
-B3_FORCE_INLINE	void	b3Transform::deSerialize(const b3TransformData& dataIn)
+B3_FORCE_INLINE void b3Transform::deSerialize(const b3TransformData& dataIn)
 {
 	m_basis.deSerialize(dataIn.m_basis);
 	m_origin.deSerialize(dataIn.m_origin);
 }
 
-B3_FORCE_INLINE	void	b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn)
+B3_FORCE_INLINE void b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn)
 {
 	m_basis.deSerializeFloat(dataIn.m_basis);
 	m_origin.deSerializeFloat(dataIn.m_origin);
 }
 
-B3_FORCE_INLINE	void	b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn)
+B3_FORCE_INLINE void b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn)
 {
 	m_basis.deSerializeDouble(dataIn.m_basis);
 	m_origin.deSerializeDouble(dataIn.m_origin);
 }
 
-
-#endif //B3_TRANSFORM_H
-
-
-
-
-
-
+#endif  //B3_TRANSFORM_H
diff --git a/thirdparty/bullet/Bullet3Common/b3TransformUtil.h b/thirdparty/bullet/Bullet3Common/b3TransformUtil.h
index 6ce580c132..1850a9be5f 100644
--- a/thirdparty/bullet/Bullet3Common/b3TransformUtil.h
+++ b/thirdparty/bullet/Bullet3Common/b3TransformUtil.h
@@ -12,204 +12,189 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #ifndef B3_TRANSFORM_UTIL_H
 #define B3_TRANSFORM_UTIL_H
 
 #include "b3Transform.h"
-#define B3_ANGULAR_MOTION_THRESHOLD b3Scalar(0.5)*B3_HALF_PI
-
-
-
+#define B3_ANGULAR_MOTION_THRESHOLD (b3Scalar(0.5) * B3_HALF_PI)
 
-B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents,const b3Vector3& supportDir)
+B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents, const b3Vector3& supportDir)
 {
 	return b3MakeVector3(supportDir.getX() < b3Scalar(0.0) ? -halfExtents.getX() : halfExtents.getX(),
-      supportDir.getY() < b3Scalar(0.0) ? -halfExtents.getY() : halfExtents.getY(),
-      supportDir.getZ() < b3Scalar(0.0) ? -halfExtents.getZ() : halfExtents.getZ()); 
+						 supportDir.getY() < b3Scalar(0.0) ? -halfExtents.getY() : halfExtents.getY(),
+						 supportDir.getZ() < b3Scalar(0.0) ? -halfExtents.getZ() : halfExtents.getZ());
 }
 
-
-
-
-
-
 /// Utils related to temporal transforms
 class b3TransformUtil
 {
-
 public:
-
-	static void integrateTransform(const b3Transform& curTrans,const b3Vector3& linvel,const b3Vector3& angvel,b3Scalar timeStep,b3Transform& predictedTransform)
+	static void integrateTransform(const b3Transform& curTrans, const b3Vector3& linvel, const b3Vector3& angvel, b3Scalar timeStep, b3Transform& predictedTransform)
 	{
 		predictedTransform.setOrigin(curTrans.getOrigin() + linvel * timeStep);
-//	#define QUATERNION_DERIVATIVE
-	#ifdef QUATERNION_DERIVATIVE
+		//	#define QUATERNION_DERIVATIVE
+#ifdef QUATERNION_DERIVATIVE
 		b3Quaternion predictedOrn = curTrans.getRotation();
 		predictedOrn += (angvel * predictedOrn) * (timeStep * b3Scalar(0.5));
 		predictedOrn.normalize();
-	#else
+#else
 		//Exponential map
 		//google for "Practical Parameterization of Rotations Using the Exponential Map", F. Sebastian Grassia
 
 		b3Vector3 axis;
-		b3Scalar	fAngle = angvel.length(); 
+		b3Scalar fAngle = angvel.length();
 		//limit the angular motion
-		if (fAngle*timeStep > B3_ANGULAR_MOTION_THRESHOLD)
+		if (fAngle * timeStep > B3_ANGULAR_MOTION_THRESHOLD)
 		{
 			fAngle = B3_ANGULAR_MOTION_THRESHOLD / timeStep;
 		}
 
-		if ( fAngle < b3Scalar(0.001) )
+		if (fAngle < b3Scalar(0.001))
 		{
 			// use Taylor's expansions of sync function
-			axis   = angvel*( b3Scalar(0.5)*timeStep-(timeStep*timeStep*timeStep)*(b3Scalar(0.020833333333))*fAngle*fAngle );
+			axis = angvel * (b3Scalar(0.5) * timeStep - (timeStep * timeStep * timeStep) * (b3Scalar(0.020833333333)) * fAngle * fAngle);
 		}
 		else
 		{
 			// sync(fAngle) = sin(c*fAngle)/t
-			axis   = angvel*( b3Sin(b3Scalar(0.5)*fAngle*timeStep)/fAngle );
+			axis = angvel * (b3Sin(b3Scalar(0.5) * fAngle * timeStep) / fAngle);
 		}
-		b3Quaternion dorn (axis.getX(),axis.getY(),axis.getZ(),b3Cos( fAngle*timeStep*b3Scalar(0.5) ));
+		b3Quaternion dorn(axis.getX(), axis.getY(), axis.getZ(), b3Cos(fAngle * timeStep * b3Scalar(0.5)));
 		b3Quaternion orn0 = curTrans.getRotation();
 
 		b3Quaternion predictedOrn = dorn * orn0;
 		predictedOrn.normalize();
-	#endif
+#endif
 		predictedTransform.setRotation(predictedOrn);
 	}
 
-	static void	calculateVelocityQuaternion(const b3Vector3& pos0,const b3Vector3& pos1,const b3Quaternion& orn0,const b3Quaternion& orn1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel)
+	static void calculateVelocityQuaternion(const b3Vector3& pos0, const b3Vector3& pos1, const b3Quaternion& orn0, const b3Quaternion& orn1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel)
 	{
 		linVel = (pos1 - pos0) / timeStep;
 		b3Vector3 axis;
-		b3Scalar  angle;
+		b3Scalar angle;
 		if (orn0 != orn1)
 		{
-			calculateDiffAxisAngleQuaternion(orn0,orn1,axis,angle);
+			calculateDiffAxisAngleQuaternion(orn0, orn1, axis, angle);
 			angVel = axis * angle / timeStep;
-		} else
+		}
+		else
 		{
-			angVel.setValue(0,0,0);
+			angVel.setValue(0, 0, 0);
 		}
 	}
 
-	static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0,const b3Quaternion& orn1a,b3Vector3& axis,b3Scalar& angle)
+	static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0, const b3Quaternion& orn1a, b3Vector3& axis, b3Scalar& angle)
 	{
 		b3Quaternion orn1 = orn0.nearest(orn1a);
 		b3Quaternion dorn = orn1 * orn0.inverse();
 		angle = dorn.getAngle();
-		axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ());
+		axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ());
 		axis[3] = b3Scalar(0.);
 		//check for axis length
 		b3Scalar len = axis.length2();
-		if (len < B3_EPSILON*B3_EPSILON)
-			axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.));
+		if (len < B3_EPSILON * B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.));
 		else
 			axis /= b3Sqrt(len);
 	}
 
-	static void	calculateVelocity(const b3Transform& transform0,const b3Transform& transform1,b3Scalar timeStep,b3Vector3& linVel,b3Vector3& angVel)
+	static void calculateVelocity(const b3Transform& transform0, const b3Transform& transform1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel)
 	{
 		linVel = (transform1.getOrigin() - transform0.getOrigin()) / timeStep;
 		b3Vector3 axis;
-		b3Scalar  angle;
-		calculateDiffAxisAngle(transform0,transform1,axis,angle);
+		b3Scalar angle;
+		calculateDiffAxisAngle(transform0, transform1, axis, angle);
 		angVel = axis * angle / timeStep;
 	}
 
-	static void calculateDiffAxisAngle(const b3Transform& transform0,const b3Transform& transform1,b3Vector3& axis,b3Scalar& angle)
+	static void calculateDiffAxisAngle(const b3Transform& transform0, const b3Transform& transform1, b3Vector3& axis, b3Scalar& angle)
 	{
 		b3Matrix3x3 dmat = transform1.getBasis() * transform0.getBasis().inverse();
 		b3Quaternion dorn;
 		dmat.getRotation(dorn);
 
-		///floating point inaccuracy can lead to w component > 1..., which breaks 
+		///floating point inaccuracy can lead to w component > 1..., which breaks
 		dorn.normalize();
-		
+
 		angle = dorn.getAngle();
-		axis = b3MakeVector3(dorn.getX(),dorn.getY(),dorn.getZ());
+		axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ());
 		axis[3] = b3Scalar(0.);
 		//check for axis length
 		b3Scalar len = axis.length2();
-		if (len < B3_EPSILON*B3_EPSILON)
-			axis = b3MakeVector3(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.));
+		if (len < B3_EPSILON * B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.));
 		else
 			axis /= b3Sqrt(len);
 	}
-
 };
 
-
-///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection 
+///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection
 ///by conservatively updating a cached separating distance/vector instead of re-calculating the closest distance
-class	b3ConvexSeparatingDistanceUtil
+class b3ConvexSeparatingDistanceUtil
 {
-	b3Quaternion	m_ornA;
-	b3Quaternion	m_ornB;
-	b3Vector3	m_posA;
-	b3Vector3	m_posB;
-	
-	b3Vector3	m_separatingNormal;
+	b3Quaternion m_ornA;
+	b3Quaternion m_ornB;
+	b3Vector3 m_posA;
+	b3Vector3 m_posB;
 
-	b3Scalar	m_boundingRadiusA;
-	b3Scalar	m_boundingRadiusB;
-	b3Scalar	m_separatingDistance;
+	b3Vector3 m_separatingNormal;
 
-public:
+	b3Scalar m_boundingRadiusA;
+	b3Scalar m_boundingRadiusB;
+	b3Scalar m_separatingDistance;
 
-	b3ConvexSeparatingDistanceUtil(b3Scalar	boundingRadiusA,b3Scalar	boundingRadiusB)
-		:m_boundingRadiusA(boundingRadiusA),
-		m_boundingRadiusB(boundingRadiusB),
-		m_separatingDistance(0.f)
+public:
+	b3ConvexSeparatingDistanceUtil(b3Scalar boundingRadiusA, b3Scalar boundingRadiusB)
+		: m_boundingRadiusA(boundingRadiusA),
+		  m_boundingRadiusB(boundingRadiusB),
+		  m_separatingDistance(0.f)
 	{
 	}
 
-	b3Scalar	getConservativeSeparatingDistance()
+	b3Scalar getConservativeSeparatingDistance()
 	{
 		return m_separatingDistance;
 	}
 
-	void	updateSeparatingDistance(const b3Transform& transA,const b3Transform& transB)
+	void updateSeparatingDistance(const b3Transform& transA, const b3Transform& transB)
 	{
 		const b3Vector3& toPosA = transA.getOrigin();
 		const b3Vector3& toPosB = transB.getOrigin();
 		b3Quaternion toOrnA = transA.getRotation();
 		b3Quaternion toOrnB = transB.getRotation();
 
-		if (m_separatingDistance>0.f)
+		if (m_separatingDistance > 0.f)
 		{
-			
-
-			b3Vector3 linVelA,angVelA,linVelB,angVelB;
-			b3TransformUtil::calculateVelocityQuaternion(m_posA,toPosA,m_ornA,toOrnA,b3Scalar(1.),linVelA,angVelA);
-			b3TransformUtil::calculateVelocityQuaternion(m_posB,toPosB,m_ornB,toOrnB,b3Scalar(1.),linVelB,angVelB);
+			b3Vector3 linVelA, angVelA, linVelB, angVelB;
+			b3TransformUtil::calculateVelocityQuaternion(m_posA, toPosA, m_ornA, toOrnA, b3Scalar(1.), linVelA, angVelA);
+			b3TransformUtil::calculateVelocityQuaternion(m_posB, toPosB, m_ornB, toOrnB, b3Scalar(1.), linVelB, angVelB);
 			b3Scalar maxAngularProjectedVelocity = angVelA.length() * m_boundingRadiusA + angVelB.length() * m_boundingRadiusB;
-			b3Vector3 relLinVel = (linVelB-linVelA);
+			b3Vector3 relLinVel = (linVelB - linVelA);
 			b3Scalar relLinVelocLength = relLinVel.dot(m_separatingNormal);
-			if (relLinVelocLength<0.f)
+			if (relLinVelocLength < 0.f)
 			{
 				relLinVelocLength = 0.f;
 			}
-	
-			b3Scalar	projectedMotion = maxAngularProjectedVelocity +relLinVelocLength;
+
+			b3Scalar projectedMotion = maxAngularProjectedVelocity + relLinVelocLength;
 			m_separatingDistance -= projectedMotion;
 		}
-	
+
 		m_posA = toPosA;
 		m_posB = toPosB;
 		m_ornA = toOrnA;
 		m_ornB = toOrnB;
 	}
 
-	void	initSeparatingDistance(const b3Vector3& separatingVector,b3Scalar separatingDistance,const b3Transform& transA,const b3Transform& transB)
+	void initSeparatingDistance(const b3Vector3& separatingVector, b3Scalar separatingDistance, const b3Transform& transA, const b3Transform& transB)
 	{
 		m_separatingDistance = separatingDistance;
 
-		if (m_separatingDistance>0.f)
+		if (m_separatingDistance > 0.f)
 		{
 			m_separatingNormal = separatingVector;
-			
+
 			const b3Vector3& toPosA = transA.getOrigin();
 			const b3Vector3& toPosB = transB.getOrigin();
 			b3Quaternion toOrnA = transA.getRotation();
@@ -220,9 +205,6 @@ public:
 			m_ornB = toOrnB;
 		}
 	}
-
 };
 
-
-#endif //B3_TRANSFORM_UTIL_H
-
+#endif  //B3_TRANSFORM_UTIL_H
diff --git a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
index 5f5ac4ac04..100fb774c1 100644
--- a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
+++ b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
@@ -14,274 +14,281 @@
  This source version has been altered.
  */
 
-#if defined (_WIN32) || defined (__i386__)
+#if defined(_WIN32) || defined(__i386__)
 #define B3_USE_SSE_IN_API
 #endif
 
 #include "b3Vector3.h"
 
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
 
 #ifdef __APPLE__
 #include <stdint.h>
-typedef  float float4 __attribute__ ((vector_size(16)));
+typedef float float4 __attribute__((vector_size(16)));
 #else
 #define float4 __m128
 #endif
 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
 
-
 #if defined B3_USE_SSE || defined _WIN32
 
-#define LOG2_ARRAY_SIZE     6
-#define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
+#define LOG2_ARRAY_SIZE 6
+#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
 
 #include <emmintrin.h>
 
-long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
-long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    const float4 *vertices = (const float4*) vv;
-    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
-    float4 dotMax = b3Assign128( -B3_INFINITY,  -B3_INFINITY,  -B3_INFINITY,  -B3_INFINITY );
-    float4 vvec = _mm_loadu_ps( vec );
-    float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
-    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+	const float4 *vertices = (const float4 *)vv;
+	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+	float4 dotMax = b3Assign128(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY);
+	float4 vvec = _mm_loadu_ps(vec);
+	float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa));  /// zzzz
+	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
 
-    long maxIndex = -1L;
+	long maxIndex = -1L;
 
-    size_t segment = 0;
-    float4 stack_array[ STACK_ARRAY_COUNT ];
+	size_t segment = 0;
+	float4 stack_array[STACK_ARRAY_COUNT];
 
 #if DEBUG
-    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+	// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
 
-    size_t index;
-    float4 max;
-    // Faster loop without cleanup code for full tiles
-    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
-    {
-        max = dotMax;
-
-        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
-        { // do four dot products at a time. Carefully avoid touching the w element.
-            float4 v0 = vertices[0];
-            float4 v1 = vertices[1];
-            float4 v2 = vertices[2];
-            float4 v3 = vertices[3];            vertices += 4;
-
-            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+1] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+2] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+3] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
-        }
-
-        // If we found a new max
-        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
-        {
-            // copy the new max across all lanes of our max accumulator
-            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
-            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
-            dotMax = max;
-
-            // find first occurrence of that max
-            size_t test;
-            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
-            {}
-            // record where it is.
-            maxIndex = 4*index + segment + indexTable[test];
-        }
-    }
-
-    // account for work we've already done
-    count -= segment;
-
-    // Deal with the last < STACK_ARRAY_COUNT vectors
-    max = dotMax;
-    index = 0;
-
-
-    if( b3Unlikely( count > 16) )
-    {
-        for( ; index + 4 <= count / 4; index+=4 )
-        { // do four dot products at a time. Carefully avoid touching the w element.
-            float4 v0 = vertices[0];
-            float4 v1 = vertices[1];
-            float4 v2 = vertices[2];
-            float4 v3 = vertices[3];            vertices += 4;
-
-            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+1] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+2] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+3] = x;
-            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-
-            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
-        }
-    }
-
-    size_t localCount = (count & -4L) - 4*index;
-    if( localCount )
-    {
+	size_t index;
+	float4 max;
+	// Faster loop without cleanup code for full tiles
+	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+	{
+		max = dotMax;
+
+		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+		}
+
+		// If we found a new max
+		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+		{
+			// copy the new max across all lanes of our max accumulator
+			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+			dotMax = max;
+
+			// find first occurrence of that max
+			size_t test;
+			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
+			{
+			}
+			// record where it is.
+			maxIndex = 4 * index + segment + indexTable[test];
+		}
+	}
+
+	// account for work we've already done
+	count -= segment;
+
+	// Deal with the last < STACK_ARRAY_COUNT vectors
+	max = dotMax;
+	index = 0;
+
+	if (b3Unlikely(count > 16))
+	{
+		for (; index + 4 <= count / 4; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+		}
+	}
+
+	size_t localCount = (count & -4L) - 4 * index;
+	if (localCount)
+	{
 #ifdef __APPLE__
-        float4 t0, t1, t2, t3, t4;
-        float4 * sap = &stack_array[index + localCount / 4];
-          vertices += localCount;      // counter the offset
-         size_t byteIndex = -(localCount) * sizeof(float);
-        //AT&T Code style assembly
-        asm volatile
-        (   ".align 4                                                                   \n\
+		float4 t0, t1, t2, t3, t4;
+		float4 *sap = &stack_array[index + localCount / 4];
+		vertices += localCount;  // counter the offset
+		size_t byteIndex = -(localCount) * sizeof(float);
+		//AT&T Code style assembly
+		asm volatile(
+			".align 4                                                                   \n\
              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
@@ -307,369 +314,375 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
          add     $16, %[byteIndex]                           // advance loop counter\n\
          jnz     0b                                          \n\
      "
-         : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
-         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
-         : "memory", "cc"
-         );
-        index += localCount/4;
+			: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+			: "memory", "cc");
+		index += localCount / 4;
 #else
-        {
-            for( unsigned int i=0; i<localCount/4; i++,index++)
-            { // do four dot products at a time. Carefully avoid touching the w element.
-                float4 v0 = vertices[0];
-                float4 v1 = vertices[1];
-                float4 v2 = vertices[2];
-                float4 v3 = vertices[3];
-                vertices += 4;
-
-                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-                lo0 = lo0*vLo;
-                lo1 = lo1*vLo;
-                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-                z = z*vHi;
-                x = x+y;
-                x = x+z;
-                stack_array[index] = x;
-                max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            }
-        }
-#endif //__APPLE__
-    }
-
-    // process the last few points
-    if( count & 3 )
-    {
-        float4 v0, v1, v2, x, y, z;
-        switch( count & 3 )
-        {
-            case 3:
-            {
-                v0 = vertices[0];
-                v1 = vertices[1];
-                v2 = vertices[2];
-
-                // Calculate 3 dot products, transpose, duplicate v2
-                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
-                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
-                lo0 = lo0*vLo;
-                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
-                z = z*vHi;
-                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
-                lo1 = lo1*vLo;
-                x = _mm_shuffle_ps(lo0, lo1, 0x88);
-                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            }
-                break;
-            case 2:
-            {
-                v0 = vertices[0];
-                v1 = vertices[1];
-                float4 xy = _mm_movelh_ps(v0, v1);
-                z = _mm_movehl_ps(v1, v0);
-                xy = xy*vLo;
-                z = _mm_shuffle_ps( z, z,  0xa8);
-                x = _mm_shuffle_ps( xy, xy, 0xa8);
-                y = _mm_shuffle_ps( xy, xy, 0xfd);
-                z = z*vHi;
-            }
-                break;
-            case 1:
-            {
-                float4 xy = vertices[0];
-                z =  _mm_shuffle_ps( xy, xy, 0xaa);
-                xy = xy*vLo;
-                z = z*vHi;
-                x = _mm_shuffle_ps(xy, xy, 0);
-                y = _mm_shuffle_ps(xy, xy, 0x55);
-            }
-                break;
-        }
-        x = x+y;
-        x = x+z;
-        stack_array[index] = x;
-        max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-        index++;
-    }
-
-    // if we found a new max.
-    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
-    { // we found a new max. Search for it
-      // find max across the max vector, place in all elements of max -- big latency hit here
-        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
-        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
-        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
-        // this where it actually makes a difference is handled in the early out at the top of the function,
-        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
-        // complexity, and removed it.
-
-        dotMax = max;
-
-        // scan for the first occurence of max in the array
-        size_t test;
-        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
-        {}
-        maxIndex = 4*index + segment + indexTable[test];
-    }
-
-    _mm_store_ss( dotResult, dotMax);
-    return maxIndex;
+		{
+			for (unsigned int i = 0; i < localCount / 4; i++, index++)
+			{  // do four dot products at a time. Carefully avoid touching the w element.
+				float4 v0 = vertices[0];
+				float4 v1 = vertices[1];
+				float4 v2 = vertices[2];
+				float4 v3 = vertices[3];
+				vertices += 4;
+
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+				lo0 = lo0 * vLo;
+				lo1 = lo1 * vLo;
+				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+				z = z * vHi;
+				x = x + y;
+				x = x + z;
+				stack_array[index] = x;
+				max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+			}
+		}
+#endif  //__APPLE__
+	}
+
+	// process the last few points
+	if (count & 3)
+	{
+		float4 v0, v1, v2, x, y, z;
+		switch (count & 3)
+		{
+			case 3:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				v2 = vertices[2];
+
+				// Calculate 3 dot products, transpose, duplicate v2
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
+				lo0 = lo0 * vLo;
+				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
+				z = z * vHi;
+				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
+				lo1 = lo1 * vLo;
+				x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			}
+			break;
+			case 2:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				float4 xy = _mm_movelh_ps(v0, v1);
+				z = _mm_movehl_ps(v1, v0);
+				xy = xy * vLo;
+				z = _mm_shuffle_ps(z, z, 0xa8);
+				x = _mm_shuffle_ps(xy, xy, 0xa8);
+				y = _mm_shuffle_ps(xy, xy, 0xfd);
+				z = z * vHi;
+			}
+			break;
+			case 1:
+			{
+				float4 xy = vertices[0];
+				z = _mm_shuffle_ps(xy, xy, 0xaa);
+				xy = xy * vLo;
+				z = z * vHi;
+				x = _mm_shuffle_ps(xy, xy, 0);
+				y = _mm_shuffle_ps(xy, xy, 0x55);
+			}
+			break;
+		}
+		x = x + y;
+		x = x + z;
+		stack_array[index] = x;
+		max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+		index++;
+	}
+
+	// if we found a new max.
+	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+	{  // we found a new max. Search for it
+		// find max across the max vector, place in all elements of max -- big latency hit here
+		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+		// this where it actually makes a difference is handled in the early out at the top of the function,
+		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+		// complexity, and removed it.
+
+		dotMax = max;
+
+		// scan for the first occurrence of max in the array
+		size_t test;
+		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
+		{
+		}
+		maxIndex = 4 * index + segment + indexTable[test];
+	}
+
+	_mm_store_ss(dotResult, dotMax);
+	return maxIndex;
 }
 
-long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
 
-long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    const float4 *vertices = (const float4*) vv;
-    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+	const float4 *vertices = (const float4 *)vv;
+	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
 
-    float4 dotmin = b3Assign128( B3_INFINITY,  B3_INFINITY,  B3_INFINITY,  B3_INFINITY );
-    float4 vvec = _mm_loadu_ps( vec );
-    float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
-    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+	float4 dotmin = b3Assign128(B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY);
+	float4 vvec = _mm_loadu_ps(vec);
+	float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa));  /// zzzz
+	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
 
-    long minIndex = -1L;
+	long minIndex = -1L;
 
-    size_t segment = 0;
-    float4 stack_array[ STACK_ARRAY_COUNT ];
+	size_t segment = 0;
+	float4 stack_array[STACK_ARRAY_COUNT];
 
 #if DEBUG
-    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+	// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
 
-    size_t index;
-    float4 min;
-    // Faster loop without cleanup code for full tiles
-    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
-    {
-        min = dotmin;
-
-        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
-        { // do four dot products at a time. Carefully avoid touching the w element.
-            float4 v0 = vertices[0];
-            float4 v1 = vertices[1];
-            float4 v2 = vertices[2];
-            float4 v3 = vertices[3];            vertices += 4;
-
-            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+1] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+2] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+3] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
-        }
-
-        // If we found a new min
-        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
-        {
-            // copy the new min across all lanes of our min accumulator
-            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
-            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
-            dotmin = min;
-
-            // find first occurrence of that min
-            size_t test;
-            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
-            {}
-            // record where it is.
-            minIndex = 4*index + segment + indexTable[test];
-        }
-    }
-
-    // account for work we've already done
-    count -= segment;
-
-    // Deal with the last < STACK_ARRAY_COUNT vectors
-    min = dotmin;
-    index = 0;
-
-
-    if(b3Unlikely( count > 16) )
-    {
-        for( ; index + 4 <= count / 4; index+=4 )
-        { // do four dot products at a time. Carefully avoid touching the w element.
-            float4 v0 = vertices[0];
-            float4 v1 = vertices[1];
-            float4 v2 = vertices[2];
-            float4 v3 = vertices[3];            vertices += 4;
-
-            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+1] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+2] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            v0 = vertices[0];
-            v1 = vertices[1];
-            v2 = vertices[2];
-            v3 = vertices[3];            vertices += 4;
-
-            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-            lo0 = lo0*vLo;
-            lo1 = lo1*vLo;
-            z = _mm_shuffle_ps(hi0, hi1, 0x88);
-            x = _mm_shuffle_ps(lo0, lo1, 0x88);
-            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            z = z*vHi;
-            x = x+y;
-            x = x+z;
-            stack_array[index+3] = x;
-            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-
-            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
-        }
-    }
-
-    size_t localCount = (count & -4L) - 4*index;
-    if( localCount )
-    {
-
-
+	size_t index;
+	float4 min;
+	// Faster loop without cleanup code for full tiles
+	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+	{
+		min = dotmin;
+
+		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+		}
+
+		// If we found a new min
+		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+		{
+			// copy the new min across all lanes of our min accumulator
+			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+			dotmin = min;
+
+			// find first occurrence of that min
+			size_t test;
+			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
+			{
+			}
+			// record where it is.
+			minIndex = 4 * index + segment + indexTable[test];
+		}
+	}
+
+	// account for work we've already done
+	count -= segment;
+
+	// Deal with the last < STACK_ARRAY_COUNT vectors
+	min = dotmin;
+	index = 0;
+
+	if (b3Unlikely(count > 16))
+	{
+		for (; index + 4 <= count / 4; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+		}
+	}
+
+	size_t localCount = (count & -4L) - 4 * index;
+	if (localCount)
+	{
 #ifdef __APPLE__
-        vertices += localCount;      // counter the offset
-        float4 t0, t1, t2, t3, t4;
-        size_t byteIndex = -(localCount) * sizeof(float);
-        float4 * sap = &stack_array[index + localCount / 4];
+		vertices += localCount;  // counter the offset
+		float4 t0, t1, t2, t3, t4;
+		size_t byteIndex = -(localCount) * sizeof(float);
+		float4 *sap = &stack_array[index + localCount / 4];
 
-        asm volatile
-        (   ".align 4                                                                   \n\
+		asm volatile(
+			".align 4                                                                   \n\
              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
@@ -695,937 +708,930 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
              add     $16, %[byteIndex]                           // advance loop counter\n\
              jnz     0b                                          \n\
              "
-         : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
-         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
-         : "memory", "cc"
-         );
-        index += localCount/4;
+			: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+			: "memory", "cc");
+		index += localCount / 4;
 #else
-        {
-            for( unsigned int i=0; i<localCount/4; i++,index++)
-            { // do four dot products at a time. Carefully avoid touching the w element.
-                float4 v0 = vertices[0];
-                float4 v1 = vertices[1];
-                float4 v2 = vertices[2];
-                float4 v3 = vertices[3];
-                vertices += 4;
-
-                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
-                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
-                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
-                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-
-                lo0 = lo0*vLo;
-                lo1 = lo1*vLo;
-                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
-                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
-                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-                z = z*vHi;
-                x = x+y;
-                x = x+z;
-                stack_array[index] = x;
-                min = _mm_min_ps( x, min );         // control the order here so that max is never NaN even if x is nan
-            }
-        }
+		{
+			for (unsigned int i = 0; i < localCount / 4; i++, index++)
+			{  // do four dot products at a time. Carefully avoid touching the w element.
+				float4 v0 = vertices[0];
+				float4 v1 = vertices[1];
+				float4 v2 = vertices[2];
+				float4 v3 = vertices[3];
+				vertices += 4;
+
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+				lo0 = lo0 * vLo;
+				lo1 = lo1 * vLo;
+				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+				z = z * vHi;
+				x = x + y;
+				x = x + z;
+				stack_array[index] = x;
+				min = _mm_min_ps(x, min);  // control the order here so that max is never NaN even if x is nan
+			}
+		}
 
 #endif
-    }
-
-    // process the last few points
-    if( count & 3 )
-    {
-        float4 v0, v1, v2, x, y, z;
-        switch( count & 3 )
-        {
-            case 3:
-            {
-                v0 = vertices[0];
-                v1 = vertices[1];
-                v2 = vertices[2];
-
-                // Calculate 3 dot products, transpose, duplicate v2
-                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
-                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
-                lo0 = lo0*vLo;
-                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
-                z = z*vHi;
-                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
-                lo1 = lo1*vLo;
-                x = _mm_shuffle_ps(lo0, lo1, 0x88);
-                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
-            }
-                break;
-            case 2:
-            {
-                v0 = vertices[0];
-                v1 = vertices[1];
-                float4 xy = _mm_movelh_ps(v0, v1);
-                z = _mm_movehl_ps(v1, v0);
-                xy = xy*vLo;
-                z = _mm_shuffle_ps( z, z,  0xa8);
-                x = _mm_shuffle_ps( xy, xy, 0xa8);
-                y = _mm_shuffle_ps( xy, xy, 0xfd);
-                z = z*vHi;
-            }
-                break;
-            case 1:
-            {
-                float4 xy = vertices[0];
-                z =  _mm_shuffle_ps( xy, xy, 0xaa);
-                xy = xy*vLo;
-                z = z*vHi;
-                x = _mm_shuffle_ps(xy, xy, 0);
-                y = _mm_shuffle_ps(xy, xy, 0x55);
-            }
-                break;
-        }
-        x = x+y;
-        x = x+z;
-        stack_array[index] = x;
-        min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-        index++;
-    }
-
-    // if we found a new min.
-    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
-    { // we found a new min. Search for it
-      // find min across the min vector, place in all elements of min -- big latency hit here
-        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
-        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
-        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
-        // this where it actually makes a difference is handled in the early out at the top of the function,
-        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
-        // complexity, and removed it.
-
-        dotmin = min;
-
-        // scan for the first occurence of min in the array
-        size_t test;
-        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
-        {}
-        minIndex = 4*index + segment + indexTable[test];
-    }
-
-    _mm_store_ss( dotResult, dotmin);
-    return minIndex;
+	}
+
+	// process the last few points
+	if (count & 3)
+	{
+		float4 v0, v1, v2, x, y, z;
+		switch (count & 3)
+		{
+			case 3:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				v2 = vertices[2];
+
+				// Calculate 3 dot products, transpose, duplicate v2
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
+				lo0 = lo0 * vLo;
+				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
+				z = z * vHi;
+				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
+				lo1 = lo1 * vLo;
+				x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			}
+			break;
+			case 2:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				float4 xy = _mm_movelh_ps(v0, v1);
+				z = _mm_movehl_ps(v1, v0);
+				xy = xy * vLo;
+				z = _mm_shuffle_ps(z, z, 0xa8);
+				x = _mm_shuffle_ps(xy, xy, 0xa8);
+				y = _mm_shuffle_ps(xy, xy, 0xfd);
+				z = z * vHi;
+			}
+			break;
+			case 1:
+			{
+				float4 xy = vertices[0];
+				z = _mm_shuffle_ps(xy, xy, 0xaa);
+				xy = xy * vLo;
+				z = z * vHi;
+				x = _mm_shuffle_ps(xy, xy, 0);
+				y = _mm_shuffle_ps(xy, xy, 0x55);
+			}
+			break;
+		}
+		x = x + y;
+		x = x + z;
+		stack_array[index] = x;
+		min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+		index++;
+	}
+
+	// if we found a new min.
+	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+	{  // we found a new min. Search for it
+		// find min across the min vector, place in all elements of min -- big latency hit here
+		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+		// this where it actually makes a difference is handled in the early out at the top of the function,
+		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+		// complexity, and removed it.
+
+		dotmin = min;
+
+		// scan for the first occurrence of min in the array
+		size_t test;
+		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
+		{
+		}
+		minIndex = 4 * index + segment + indexTable[test];
+	}
+
+	_mm_store_ss(dotResult, dotmin);
+	return minIndex;
 }
 
-
 #elif defined B3_USE_NEON
-#define ARM_NEON_GCC_COMPATIBILITY  1
+#define ARM_NEON_GCC_COMPATIBILITY 1
 #include <arm_neon.h>
 
+static long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);   // maxdot: implementation variant v0
+static long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);   // maxdot: implementation variant v1
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);  // maxdot: selector, initial target of the b3_maxdot_large pointer
+static long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);   // mindot: implementation variant v0
+static long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);   // mindot: implementation variant v1
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);  // mindot: selector, initial target of the b3_mindot_large pointer
 
-static long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long (*b3_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_maxdot_large_sel;  // starts at the selector, which reassigns this pointer when it is called
+long (*b3_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_mindot_large_sel;  // starts at the selector, which reassigns this pointer when it is called
 
-long (*b3_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_maxdot_large_sel;
-long (*b3_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_mindot_large_sel;
-
-extern "C" {int  _get_cpu_capabilities( void );}
-
-static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+extern "C"
 {
-    if( _get_cpu_capabilities() & 0x2000 )
-        b3_maxdot_large = _maxdot_large_v1;
-    else
-        b3_maxdot_large = _maxdot_large_v0;
-
-    return b3_maxdot_large(vv, vec, count, dotResult);
+	int _get_cpu_capabilities(void);  // C-linkage declaration only; no definition is visible here -- presumably platform-provided, TODO confirm
 }
 
-static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)  // selector: rebinds b3_maxdot_large to v0 or v1, then forwards this call
 {
-    if( _get_cpu_capabilities() & 0x2000 )
-        b3_mindot_large = _mindot_large_v1;
-    else
-        b3_mindot_large = _mindot_large_v0;
+	if (_get_cpu_capabilities() & 0x2000)  // NOTE(review): meaning of capability bit 0x2000 is not shown in this file section -- confirm
+		b3_maxdot_large = b3_maxdot_large_v1;  // was _maxdot_large_v1, an identifier that is never declared
+	else
+		b3_maxdot_large = b3_maxdot_large_v0;  // was _maxdot_large_v0, an identifier that is never declared
 
-    return b3_mindot_large(vv, vec, count, dotResult);
+	return b3_maxdot_large(vv, vec, count, dotResult);  // call through the pointer that was just rebound; returns its result unchanged
 }
 
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)  // selector: rebinds b3_mindot_large to v0 or v1, then forwards this call
+{
+	if (_get_cpu_capabilities() & 0x2000)  // NOTE(review): meaning of capability bit 0x2000 is not shown in this file section -- confirm
+		b3_mindot_large = b3_mindot_large_v1;  // was _mindot_large_v1, an identifier that is never declared
+	else
+		b3_mindot_large = b3_mindot_large_v0;  // was _mindot_large_v0, an identifier that is never declared
+
+	return b3_mindot_large(vv, vec, count, dotResult);  // call through the pointer that was just rebound; returns its result unchanged
+}
 
-#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
-
+#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })  /* statement expression yielding the loaded float32x4_t; _ptr is a read-write ("+r") asm operand, so it must be a modifiable lvalue */
 
-long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    unsigned long i = 0;
-    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
-    float32x2_t vLo = vget_low_f32(vvec);
-    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
-    float32x2_t dotMaxLo = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
-    float32x2_t dotMaxHi = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
-    uint32x2_t indexLo = (uint32x2_t) {0, 1};
-    uint32x2_t indexHi = (uint32x2_t) {2, 3};
-    uint32x2_t iLo = (uint32x2_t) {-1, -1};
-    uint32x2_t iHi = (uint32x2_t) {-1, -1};
-    const uint32x2_t four = (uint32x2_t) {4,4};
-
-    for( ; i+8 <= count; i+= 8 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
-        float32x2_t rLo = vpadd_f32( xy0, xy1);
-        float32x2_t rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
-        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
-        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-
-        v0 = vld1q_f32_aligned_postincrement( vv );
-        v1 = vld1q_f32_aligned_postincrement( vv );
-        v2 = vld1q_f32_aligned_postincrement( vv );
-        v3 = vld1q_f32_aligned_postincrement( vv );
-
-        xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        zLo = vmul_f32( z0.val[0], vHi);
-        zHi = vmul_f32( z1.val[0], vHi);
-
-        rLo = vpadd_f32( xy0, xy1);
-        rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        maskLo = vcgt_f32( rLo, dotMaxLo );
-        maskHi = vcgt_f32( rHi, dotMaxHi );
-        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-    }
-
-    for( ; i+4 <= count; i+= 4 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
-        float32x2_t rLo = vpadd_f32( xy0, xy1);
-        float32x2_t rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
-        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
-        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-    }
-
-    switch( count & 3 )
-    {
-        case 3:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
-            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
-            float32x2_t rLo = vpadd_f32( xy0, xy1);
-            float32x2_t rHi = vpadd_f32( xy2, xy2);
-            rLo = vadd_f32(rLo, zLo);
-            rHi = vadd_f32(rHi, zHi);
-
-            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
-            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
-            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-            iHi = vbsl_u32(maskHi, indexHi, iHi);
-        }
-            break;
-        case 2:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
-            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
-            float32x2_t rLo = vpadd_f32( xy0, xy1);
-            rLo = vadd_f32(rLo, zLo);
-
-            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
-            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-        }
-            break;
-        case 1:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
-            float32x2_t zLo = vmul_f32( z0, vHi);
-            float32x2_t rLo = vpadd_f32( xy0, xy0);
-            rLo = vadd_f32(rLo, zLo);
-            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
-            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-        }
-            break;
-
-        default:
-            break;
-    }
-
-    // select best answer between hi and lo results
-    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
-    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
-    iLo = vbsl_u32(mask, iHi, iLo);
-
-    // select best answer between even and odd results
-    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
-    iHi = vdup_lane_u32(iLo, 1);
-    mask = vcgt_f32( dotMaxHi, dotMaxLo );
-    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
-    iLo = vbsl_u32(mask, iHi, iLo);
-
-    *dotResult = vget_lane_f32( dotMaxLo, 0);
-    return vget_lane_u32(iLo, 0);
+	unsigned long i = 0;
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x2_t vLo = vget_low_f32(vvec);
+	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+	float32x2_t dotMaxLo = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
+	float32x2_t dotMaxHi = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
+	uint32x2_t indexLo = (uint32x2_t){0, 1};
+	uint32x2_t indexHi = (uint32x2_t){2, 3};
+	uint32x2_t iLo = (uint32x2_t){-1, -1};
+	uint32x2_t iHi = (uint32x2_t){-1, -1};
+	const uint32x2_t four = (uint32x2_t){4, 4};
+
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		zLo = vmul_f32(z0.val[0], vHi);
+		zHi = vmul_f32(z1.val[0], vHi);
+
+		rLo = vpadd_f32(xy0, xy1);
+		rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		maskLo = vcgt_f32(rLo, dotMaxLo);
+		maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+
+	switch (count & 3)
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			float32x2_t rHi = vpadd_f32(xy2, xy2);
+			rLo = vadd_f32(rLo, zLo);
+			rHi = vadd_f32(rHi, zHi);
+
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+			iHi = vbsl_u32(maskHi, indexHi, iHi);
+		}
+		break;
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			rLo = vadd_f32(rLo, zLo);
+
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+			float32x2_t zLo = vmul_f32(z0, vHi);
+			float32x2_t rLo = vpadd_f32(xy0, xy0);
+			rLo = vadd_f32(rLo, zLo);
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
+	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	// select best answer between even and odd results
+	dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+	iHi = vdup_lane_u32(iLo, 1);
+	mask = vcgt_f32(dotMaxHi, dotMaxLo);
+	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	*dotResult = vget_lane_f32(dotMaxLo, 0);
+	return vget_lane_u32(iLo, 0);
 }
 
-
-long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
-    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
-    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
-    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
-    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
-    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
-    float32x4_t maxDot = (float32x4_t) { -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY };
-
-    unsigned long i = 0;
-    for( ; i + 8 <= count; i += 8 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        float32x4x2_t zb = vuzpq_f32( z0, z1);
-        float32x4_t z = vmulq_f32( zb.val[0], vHi);
-        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        uint32x4_t mask = vcgtq_f32(x, maxDot);
-        maxDot = vbslq_f32( mask, x, maxDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-
-        v0 = vld1q_f32_aligned_postincrement( vv );
-        v1 = vld1q_f32_aligned_postincrement( vv );
-        v2 = vld1q_f32_aligned_postincrement( vv );
-        v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        zb = vuzpq_f32( z0, z1);
-        z = vmulq_f32( zb.val[0], vHi);
-        xy = vuzpq_f32( xy0, xy1);
-        x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        mask = vcgtq_f32(x, maxDot);
-        maxDot = vbslq_f32( mask, x, maxDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-    }
-
-    for( ; i + 4 <= count; i += 4 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        float32x4x2_t zb = vuzpq_f32( z0, z1);
-        float32x4_t z = vmulq_f32( zb.val[0], vHi);
-        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        uint32x4_t mask = vcgtq_f32(x, maxDot);
-        maxDot = vbslq_f32( mask, x, maxDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-    }
-
-    switch (count & 3) {
-        case 3:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
-            xy0 = vmulq_f32(xy0, vLo);
-            xy1 = vmulq_f32(xy1, vLo);
-
-            float32x4x2_t zb = vuzpq_f32( z0, z1);
-            float32x4_t z = vmulq_f32( zb.val[0], vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcgtq_f32(x, maxDot);
-            maxDot = vbslq_f32( mask, x, maxDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        case 2:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
-            xy0 = vmulq_f32(xy0, vLo);
-
-            float32x4x2_t zb = vuzpq_f32( z0, z0);
-            float32x4_t z = vmulq_f32( zb.val[0], vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcgtq_f32(x, maxDot);
-            maxDot = vbslq_f32( mask, x, maxDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        case 1:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
-            xy0 = vmulq_f32(xy0, vLo);
-
-            z = vmulq_f32( z, vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcgtq_f32(x, maxDot);
-            maxDot = vbslq_f32( mask, x, maxDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        default:
-            break;
-    }
-
-
-    // select best answer between hi and lo results
-    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
-    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
-    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
-    // select best answer between even and odd results
-    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
-    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
-    mask = vcgt_f32( maxDotO, maxDot2 );
-    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
-    index2 = vbsl_u32(mask, indexHi, index2);
-
-    *dotResult = vget_lane_f32( maxDot2, 0);
-    return vget_lane_u32(index2, 0);
-
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);  // query vector; the load helper is defined outside this chunk, presumably an aligned 4-float load that advances the pointer (confirm)
+	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));  // {vec[0], vec[1], vec[0], vec[1]}
+	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);  // vec[2] broadcast to all four lanes
+	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};  // index step applied after each batch of four vertices
+	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};  // indices of the four vertices in the batch being tested
+	uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};  // per-lane index of the best vertex so far; all-ones until a lane is first updated
+	float32x4_t maxDot = (float32x4_t){-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY};  // per-lane running maximum dot product
+
+	unsigned long i = 0;  // vertices consumed so far by the two loops below
+	for (; i + 8 <= count; i += 8)  // main loop, unrolled: two batches of four vertices per iteration
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);  // one load per vertex; only lanes 0-2 of each are used below
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));  // {v0[0], v0[1], v1[0], v1[1]}
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));  // {v2[0], v2[1], v3[0], v3[1]}
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));  // {v0[2], v0[3], v1[2], v1[3]}
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));  // {v2[2], v2[3], v3[2], v3[3]}
+
+		xy0 = vmulq_f32(xy0, vLo);  // lane-0/lane-1 components multiplied by vec[0]/vec[1]
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);  // de-interleave: val[0] = {v0[2], v1[2], v2[2], v3[2]}
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);  // lane-2 components multiplied by vec[2]
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);  // val[0] = the four lane-0 products, val[1] = the four lane-1 products
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);  // x = 3-component dot product of v0..v3 with vec, one per lane
+
+		uint32x4_t mask = vcgtq_f32(x, maxDot);  // strictly greater: on a tie the earlier vertex keeps its lane
+		maxDot = vbslq_f32(mask, x, maxDot);  // take the new value where mask is set
+		index = vbslq_u32(mask, local_index, index);  // and record which vertex produced it
+		local_index = vaddq_u32(local_index, four);  // move on to the next batch's indices
+
+		v0 = vld1q_f32_aligned_postincrement(vv);  // second batch of the unrolled iteration; same steps as above
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		zb = vuzpq_f32(z0, z1);
+		z = vmulq_f32(zb.val[0], vHi);
+		xy = vuzpq_f32(xy0, xy1);
+		x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		mask = vcgtq_f32(x, maxDot);
+		maxDot = vbslq_f32(mask, x, maxDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	for (; i + 4 <= count; i += 4)  // runs at most once: one remaining full batch of four vertices
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);  // four dot products, as in the main loop
+
+		uint32x4_t mask = vcgtq_f32(x, maxDot);
+		maxDot = vbslq_f32(mask, x, maxDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	switch (count & 3)  // 1-3 leftover vertices; spare lanes are padded with copies of a real vertex. NOTE(review): ties in the final reduction appear to favour the real (lower) lane, so a padded lane's index should never be returned - confirm
+	{
+		case 3:  // x lanes become {dot(v0), dot(v1), dot(v2), dot(v2)}
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));  // v2 used twice to fill the fourth slot
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+			xy0 = vmulq_f32(xy0, vLo);
+			xy1 = vmulq_f32(xy1, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z1);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);  // local_index is not read again after the switch
+		}
+		break;
+
+		case 2:  // x lanes become {dot(v0), dot(v1), dot(v0), dot(v1)}
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z0);  // same register on both sides, so lanes 2-3 repeat lanes 0-1
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);  // local_index is not read again after the switch
+		}
+		break;
+
+		case 1:  // all four x lanes hold dot(v0)
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));  // {v0[0], v0[1], v0[0], v0[1]}
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);  // v0[2] in every lane
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			z = vmulq_f32(z, vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);  // local_index is not read again after the switch
+		}
+		break;
+
+		default:  // count is a multiple of four: nothing left over
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));  // lanes {2,3} vs lanes {0,1}; the low half wins ties
+	float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+	// select best answer between even and odd results
+	float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);  // odd-lane candidate copied into both lanes
+	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+	mask = vcgt_f32(maxDotO, maxDot2);  // only lane 0 of the result is read below; the even lane wins ties
+	maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+	index2 = vbsl_u32(mask, indexHi, index2);
+
+	*dotResult = vget_lane_f32(maxDot2, 0);  // largest dot product found; stays -B3_INFINITY if nothing ever compared greater
+	return vget_lane_u32(index2, 0);  // index of that vertex; the all-ones sentinel comes back if no lane was ever updated (e.g. count == 0)
 }
 
-long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    unsigned long i = 0;
-    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
-    float32x2_t vLo = vget_low_f32(vvec);
-    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
-    float32x2_t dotMinLo = (float32x2_t) { B3_INFINITY, B3_INFINITY };
-    float32x2_t dotMinHi = (float32x2_t) { B3_INFINITY, B3_INFINITY };
-    uint32x2_t indexLo = (uint32x2_t) {0, 1};
-    uint32x2_t indexHi = (uint32x2_t) {2, 3};
-    uint32x2_t iLo = (uint32x2_t) {-1, -1};
-    uint32x2_t iHi = (uint32x2_t) {-1, -1};
-    const uint32x2_t four = (uint32x2_t) {4,4};
-
-    for( ; i+8 <= count; i+= 8 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
-        float32x2_t rLo = vpadd_f32( xy0, xy1);
-        float32x2_t rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
-        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
-        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-
-        v0 = vld1q_f32_aligned_postincrement( vv );
-        v1 = vld1q_f32_aligned_postincrement( vv );
-        v2 = vld1q_f32_aligned_postincrement( vv );
-        v3 = vld1q_f32_aligned_postincrement( vv );
-
-        xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        zLo = vmul_f32( z0.val[0], vHi);
-        zHi = vmul_f32( z1.val[0], vHi);
-
-        rLo = vpadd_f32( xy0, xy1);
-        rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        maskLo = vclt_f32( rLo, dotMinLo );
-        maskHi = vclt_f32( rHi, dotMinHi );
-        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-    }
-
-    for( ; i+4 <= count; i+= 4 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
-        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
-        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
-        float32x2_t rLo = vpadd_f32( xy0, xy1);
-        float32x2_t rHi = vpadd_f32( xy2, xy3);
-        rLo = vadd_f32(rLo, zLo);
-        rHi = vadd_f32(rHi, zHi);
-
-        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
-        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
-        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
-        iLo = vbsl_u32(maskLo, indexLo, iLo);
-        iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four);
-        indexHi = vadd_u32(indexHi, four);
-    }
-    switch( count & 3 )
-    {
-        case 3:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
-            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
-            float32x2_t rLo = vpadd_f32( xy0, xy1);
-            float32x2_t rHi = vpadd_f32( xy2, xy2);
-            rLo = vadd_f32(rLo, zLo);
-            rHi = vadd_f32(rHi, zHi);
-
-            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
-            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
-            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-            iHi = vbsl_u32(maskHi, indexHi, iHi);
-        }
-            break;
-        case 2:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
-            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
-            float32x2_t rLo = vpadd_f32( xy0, xy1);
-            rLo = vadd_f32(rLo, zLo);
-
-            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
-            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-        }
-            break;
-        case 1:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
-            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
-            float32x2_t zLo = vmul_f32( z0, vHi);
-            float32x2_t rLo = vpadd_f32( xy0, xy0);
-            rLo = vadd_f32(rLo, zLo);
-            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
-            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
-            iLo = vbsl_u32(maskLo, indexLo, iLo);
-        }
-            break;
-
-        default:
-            break;
-    }
-
-    // select best answer between hi and lo results
-    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
-    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
-    iLo = vbsl_u32(mask, iHi, iLo);
-
-    // select best answer between even and odd results
-    dotMinHi = vdup_lane_f32(dotMinLo, 1);
-    iHi = vdup_lane_u32(iLo, 1);
-    mask = vclt_f32( dotMinHi, dotMinLo );
-    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
-    iLo = vbsl_u32(mask, iHi, iLo);
-
-    *dotResult = vget_lane_f32( dotMinLo, 0);
-    return vget_lane_u32(iLo, 0);
+	unsigned long i = 0;
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x2_t vLo = vget_low_f32(vvec);
+	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+	float32x2_t dotMinLo = (float32x2_t){B3_INFINITY, B3_INFINITY};
+	float32x2_t dotMinHi = (float32x2_t){B3_INFINITY, B3_INFINITY};
+	uint32x2_t indexLo = (uint32x2_t){0, 1};
+	uint32x2_t indexHi = (uint32x2_t){2, 3};
+	uint32x2_t iLo = (uint32x2_t){-1, -1};
+	uint32x2_t iHi = (uint32x2_t){-1, -1};
+	const uint32x2_t four = (uint32x2_t){4, 4};
+
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		zLo = vmul_f32(z0.val[0], vHi);
+		zHi = vmul_f32(z1.val[0], vHi);
+
+		rLo = vpadd_f32(xy0, xy1);
+		rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		maskLo = vclt_f32(rLo, dotMinLo);
+		maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+	switch (count & 3)
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			float32x2_t rHi = vpadd_f32(xy2, xy2);
+			rLo = vadd_f32(rLo, zLo);
+			rHi = vadd_f32(rHi, zHi);
+
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+			iHi = vbsl_u32(maskHi, indexHi, iHi);
+		}
+		break;
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			rLo = vadd_f32(rLo, zLo);
+
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+			float32x2_t zLo = vmul_f32(z0, vHi);
+			float32x2_t rLo = vpadd_f32(xy0, xy0);
+			rLo = vadd_f32(rLo, zLo);
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
+	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	// select best answer between even and odd results
+	dotMinHi = vdup_lane_f32(dotMinLo, 1);
+	iHi = vdup_lane_u32(iLo, 1);
+	mask = vclt_f32(dotMinHi, dotMinLo);
+	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	*dotResult = vget_lane_f32(dotMinLo, 0);
+	return vget_lane_u32(iLo, 0);
 }
 
-long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
 {
-    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
-    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
-    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
-    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
-    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
-    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
-    float32x4_t minDot = (float32x4_t) { B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY };
-
-    unsigned long i = 0;
-    for( ; i + 8 <= count; i += 8 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        float32x4x2_t zb = vuzpq_f32( z0, z1);
-        float32x4_t z = vmulq_f32( zb.val[0], vHi);
-        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        uint32x4_t mask = vcltq_f32(x, minDot);
-        minDot = vbslq_f32( mask, x, minDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-
-        v0 = vld1q_f32_aligned_postincrement( vv );
-        v1 = vld1q_f32_aligned_postincrement( vv );
-        v2 = vld1q_f32_aligned_postincrement( vv );
-        v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        zb = vuzpq_f32( z0, z1);
-        z = vmulq_f32( zb.val[0], vHi);
-        xy = vuzpq_f32( xy0, xy1);
-        x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        mask = vcltq_f32(x, minDot);
-        minDot = vbslq_f32( mask, x, minDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-    }
-
-    for( ; i + 4 <= count; i += 4 )
-    {
-        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
-        // the next two lines should resolve to a single vswp d, d
-        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
-        xy0 = vmulq_f32(xy0, vLo);
-        xy1 = vmulq_f32(xy1, vLo);
-
-        float32x4x2_t zb = vuzpq_f32( z0, z1);
-        float32x4_t z = vmulq_f32( zb.val[0], vHi);
-        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-        x = vaddq_f32(x, z);
-
-        uint32x4_t mask = vcltq_f32(x, minDot);
-        minDot = vbslq_f32( mask, x, minDot);
-        index = vbslq_u32(mask, local_index, index);
-        local_index = vaddq_u32(local_index, four);
-    }
-
-    switch (count & 3) {
-        case 3:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
-            xy0 = vmulq_f32(xy0, vLo);
-            xy1 = vmulq_f32(xy1, vLo);
-
-            float32x4x2_t zb = vuzpq_f32( z0, z1);
-            float32x4_t z = vmulq_f32( zb.val[0], vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcltq_f32(x, minDot);
-            minDot = vbslq_f32( mask, x, minDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        case 2:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
-            xy0 = vmulq_f32(xy0, vLo);
-
-            float32x4x2_t zb = vuzpq_f32( z0, z0);
-            float32x4_t z = vmulq_f32( zb.val[0], vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcltq_f32(x, minDot);
-            minDot = vbslq_f32( mask, x, minDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        case 1:
-        {
-            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
-            // the next two lines should resolve to a single vswp d, d
-            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
-            xy0 = vmulq_f32(xy0, vLo);
-
-            z = vmulq_f32( z, vHi);
-            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
-            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
-            x = vaddq_f32(x, z);
-
-            uint32x4_t mask = vcltq_f32(x, minDot);
-            minDot = vbslq_f32( mask, x, minDot);
-            index = vbslq_u32(mask, local_index, index);
-            local_index = vaddq_u32(local_index, four);
-        }
-            break;
-
-        default:
-            break;
-    }
-
-
-    // select best answer between hi and lo results
-    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
-    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
-    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
-    // select best answer between even and odd results
-    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
-    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
-    mask = vclt_f32( minDotO, minDot2 );
-    minDot2 = vbsl_f32(mask, minDotO, minDot2);
-    index2 = vbsl_u32(mask, indexHi, index2);
-
-    *dotResult = vget_lane_f32( minDot2, 0);
-    return vget_lane_u32(index2, 0);
-
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);  // Finds the vertex in vv whose xyz dot product with vec is smallest. Helper is defined outside this view; presumably loads 4 floats and advances the pointer - TODO confirm
+	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));  // elements 0,1 of vec repeated: (x, y, x, y)
+	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);  // element 2 of vec (z) in all four lanes
+	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
+	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};  // vertex index handled by each lane in the current group of four
+	uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};  // per-lane index of the best vertex so far; all bits set means "none yet"
+	float32x4_t minDot = (float32x4_t){B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY};  // per-lane running minimum
+
+	unsigned long i = 0;  // number of vertices consumed so far
+	for (; i + 8 <= count; i += 8)  // main loop: two groups of four vertices per iteration
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));  // (x0, y0, x1, y1)
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));  // (x2, y2, x3, y3)
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));  // (z0, w0, z1, w1)
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));  // (z2, w2, z3, w3)
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);  // val[0] = even elements = (z0, z1, z2, z3); the w elements land in val[1] and are unused
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);  // val[0] = the four x products, val[1] = the four y products
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);  // x now holds the four dot products
+
+		uint32x4_t mask = vcltq_f32(x, minDot);  // lanes where the new dot is strictly smaller
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+
+		v0 = vld1q_f32_aligned_postincrement(vv);  // second group of four: same steps as above
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		zb = vuzpq_f32(z0, z1);
+		z = vmulq_f32(zb.val[0], vHi);
+		xy = vuzpq_f32(xy0, xy1);
+		x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		mask = vcltq_f32(x, minDot);
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	for (; i + 4 <= count; i += 4)  // fewer than 8 vertices remain here, so this body runs at most once
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		uint32x4_t mask = vcltq_f32(x, minDot);
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	switch (count & 3)  // 1-3 leftover vertices: unused lanes are filled with copies of loaded vertices, so nothing past the input is read
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));  // lane 3 is a second copy of v2
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+			xy0 = vmulq_f32(xy0, vLo);
+			xy1 = vmulq_f32(xy1, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z1);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);  // lane 3 may record index count here; the strict '<' reductions below never pick it over lane 2
+			local_index = vaddq_u32(local_index, four);  // not read again after the switch
+		}
+		break;
+
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			// pack the xy halves of both vertices: (x0, y0, x1, y1)
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			// pack the zw halves of both vertices: (z0, w0, z1, w1)
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z0);  // val[0] = (z0, z1, z0, z1): lanes 2,3 repeat lanes 0,1
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);  // lanes 2,3 may record out-of-range indices; they only tie with lanes 0,1 and lose the strict '<' below
+			local_index = vaddq_u32(local_index, four);  // not read again after the switch
+		}
+		break;
+
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+
+			// repeat the xy half of the single vertex: (x0, y0, x0, y0)
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
+			// broadcast element 2 (z) of the single vertex to all four lanes
+			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			z = vmulq_f32(z, vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);  // all four lanes hold the same dot product
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);  // lanes 1-3 may record out-of-range indices; they only tie with lane 0 and lose the strict '<' below
+			local_index = vaddq_u32(local_index, four);  // not read again after the switch
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results (strict '<': on a tie the lo lane is kept)
+	uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
+	float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+	// select best answer between even and odd results (strict '<': on a tie lane 0 is kept)
+	float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+	mask = vclt_f32(minDotO, minDot2);
+	minDot2 = vbsl_f32(mask, minDotO, minDot2);
+	index2 = vbsl_u32(mask, indexHi, index2);
+
+	*dotResult = vget_lane_f32(minDot2, 0);  // smallest dot product found (stays B3_INFINITY if no lane was ever updated)
+	return vget_lane_u32(index2, 0);  // NOTE(review): if count == 0 this is the all-ones sentinel as a uint32, which is -1 only where long is 32 bits - confirm callers never pass 0
 }
 
 #else
-    #error Unhandled __APPLE__ arch
+#error Unhandled __APPLE__ arch
 #endif
 
-#endif  /* __APPLE__ */
-
-
+#endif /* __APPLE__ */
diff --git a/thirdparty/bullet/Bullet3Common/b3Vector3.h b/thirdparty/bullet/Bullet3Common/b3Vector3.h
index 16ec02b0ed..56e6c13311 100644
--- a/thirdparty/bullet/Bullet3Common/b3Vector3.h
+++ b/thirdparty/bullet/Bullet3Common/b3Vector3.h
@@ -12,8 +12,6 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
-
 #ifndef B3_VECTOR3_H
 #define B3_VECTOR3_H
 
@@ -28,37 +26,34 @@ subject to the following restrictions:
 #else
 #define b3Vector3Data b3Vector3FloatData
 #define b3Vector3DataName "b3Vector3FloatData"
-#endif //B3_USE_DOUBLE_PRECISION
+#endif  //B3_USE_DOUBLE_PRECISION
 
 #if defined B3_USE_SSE
 
 //typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
 
 #ifdef _MSC_VER
-#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
+#pragma warning(disable : 4556)  // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
 #endif
 
-
-#define B3_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
+#define B3_SHUFFLE(x, y, z, w) ((w) << 6 | (z) << 4 | (y) << 2 | (x))
 //#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
-#define b3_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
-#define b3_splat3_ps( _a, _i ) b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i, 3) )
-#define b3_splat_ps( _a, _i )  b3_pshufd_ps((_a), B3_SHUFFLE(_i,_i,_i,_i) )
+#define b3_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
+#define b3_splat3_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, 3))
+#define b3_splat_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, _i))
 
 #define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
-#define b3vAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
+#define b3vAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
 #define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
 #define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask)
 #define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask)
 #define b3vxyzMaskf b3vFFF0fMask
 #define b3vAbsfMask b3CastiTo128f(b3vAbsMask)
 
-
-
 const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
 const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
 const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
-const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};
 
 #endif
 
@@ -74,70 +69,69 @@ const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x
 class b3Vector3;
 class b3Vector4;
 
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 //#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
-inline b3Vector3 b3MakeVector3( b3SimdFloat4 v);
-inline b3Vector4 b3MakeVector4( b3SimdFloat4 vec);
+inline b3Vector3 b3MakeVector3(b3SimdFloat4 v);
+inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec);
 #endif
 
-inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z);
-inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w);
-inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w);
-
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z);
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
+inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
 
 /**@brief b3Vector3 can be used to represent 3D points and vectors.
  * It has an un-used w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
  * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
  */
-B3_ATTRIBUTE_ALIGNED16(class) b3Vector3
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Vector3
 {
 public:
-#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM
-        union {
-            b3SimdFloat4      mVec128;
-            float	m_floats[4];
-			struct {float x,y,z,w;};
-
-        };
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
+	union {
+		b3SimdFloat4 mVec128;
+		float m_floats[4];
+		struct
+		{
+			float x, y, z, w;
+		};
+	};
 #else
-	union
-	{
-        	float	m_floats[4];
-			struct {float	x,y,z,w;};
+	union {
+		float m_floats[4];
+		struct
+		{
+			float x, y, z, w;
+		};
 	};
 #endif
 
-
 public:
-
 	B3_DECLARE_ALIGNED_ALLOCATOR();
 
-#if defined (B3_USE_SSE) || defined(B3_USE_NEON) // _WIN32 || ARM
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
 
 	/*B3_FORCE_INLINE		b3Vector3()
 	{
 	}
 	*/
 
-    B3_FORCE_INLINE	b3SimdFloat4	get128() const
-    {
-        return mVec128;
-    }
-    B3_FORCE_INLINE	void	set128(b3SimdFloat4 v128)
-    {
-        mVec128 = v128;
-    }
+	B3_FORCE_INLINE b3SimdFloat4 get128() const
+	{
+		return mVec128;
+	}
+	B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
+	{
+		mVec128 = v128;
+	}
 #endif
 
-	public:
-
-
-
-/**@brief Add a vector to this one
+public:
+	/**@brief Add a vector to this one
  * @param The vector to add to this one */
 	B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_add_ps(mVec128, v.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vaddq_f32(mVec128, v.mVec128);
@@ -149,12 +143,11 @@ public:
 		return *this;
 	}
 
-
-  /**@brief Subtract a vector from this one
+	/**@brief Subtract a vector from this one
    * @param The vector to subtract */
 	B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vsubq_f32(mVec128, v.mVec128);
@@ -166,13 +159,13 @@ public:
 		return *this;
 	}
 
-  /**@brief Scale the vector
+	/**@brief Scale the vector
    * @param s Scale factor */
 	B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-		vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
 		mVec128 = _mm_mul_ps(mVec128, vs);
 #elif defined(B3_USE_NEON)
 		mVec128 = vmulq_n_f32(mVec128, s);
@@ -184,13 +177,13 @@ public:
 		return *this;
 	}
 
-  /**@brief Inversely scale the vector
+	/**@brief Inversely scale the vector
    * @param s Scale factor to divide by */
 	B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s)
 	{
 		b3FullAssert(s != b3Scalar(0.0));
 
-#if 0 //defined(B3_USE_SSE_IN_API)
+#if 0  //defined(B3_USE_SSE_IN_API)
 // this code is not faster !
 		__m128 vs = _mm_load_ss(&s);
 		vs = _mm_div_ss(b3v1110, vs);
@@ -204,11 +197,11 @@ public:
 #endif
 	}
 
-  /**@brief Return the dot product
+	/**@brief Return the dot product
    * @param v The other vector in the dot product */
 	B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
 		__m128 z = _mm_movehl_ps(vd, vd);
 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
@@ -221,29 +214,29 @@ public:
 		x = vadd_f32(x, vget_high_f32(vd));
 		return vget_lane_f32(x, 0);
 #else
-		return	m_floats[0] * v.m_floats[0] +
-				m_floats[1] * v.m_floats[1] +
-				m_floats[2] * v.m_floats[2];
+		return m_floats[0] * v.m_floats[0] +
+			   m_floats[1] * v.m_floats[1] +
+			   m_floats[2] * v.m_floats[2];
 #endif
 	}
 
-  /**@brief Return the length of the vector squared */
+	/**@brief Return the length of the vector squared */
 	B3_FORCE_INLINE b3Scalar length2() const
 	{
 		return dot(*this);
 	}
 
-  /**@brief Return the length of the vector */
+	/**@brief Return the length of the vector */
 	B3_FORCE_INLINE b3Scalar length() const
 	{
 		return b3Sqrt(length2());
 	}
 
-  /**@brief Return the distance squared between the ends of this and another vector
+	/**@brief Return the distance squared between the ends of this and another vector
    * This is symantically treating the vector like a point */
 	B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const;
 
-  /**@brief Return the distance between the ends of this and another vector
+	/**@brief Return the distance between the ends of this and another vector
    * This is symantically treating the vector like a point */
 	B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const;
 
@@ -251,7 +244,7 @@ public:
 	{
 		b3Scalar l2 = length2();
 		//triNormal.normalize();
-		if (l2 >= B3_EPSILON*B3_EPSILON)
+		if (l2 >= B3_EPSILON * B3_EPSILON)
 		{
 			(*this) /= b3Sqrt(l2);
 		}
@@ -262,43 +255,42 @@ public:
 		return *this;
 	}
 
-  /**@brief Normalize this vector
+	/**@brief Normalize this vector
    * x^2 + y^2 + z^2 = 1 */
 	B3_FORCE_INLINE b3Vector3& normalize()
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-        // dot product first
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// dot product first
 		__m128 vd = _mm_mul_ps(mVec128, mVec128);
 		__m128 z = _mm_movehl_ps(vd, vd);
 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
 		vd = _mm_add_ss(vd, y);
 		vd = _mm_add_ss(vd, z);
 
-        #if 0
+#if 0
         vd = _mm_sqrt_ss(vd);
 		vd = _mm_div_ss(b3v1110, vd);
 		vd = b3_splat_ps(vd, 0x80);
 		mVec128 = _mm_mul_ps(mVec128, vd);
-        #else
+#else
 
-        // NR step 1/sqrt(x) - vd is x, y is output
-        y = _mm_rsqrt_ss(vd); // estimate
+		// NR step 1/sqrt(x) - vd is x, y is output
+		y = _mm_rsqrt_ss(vd);  // estimate
 
-        //  one step NR
-        z = b3v1_5;
-        vd = _mm_mul_ss(vd, b3vHalf); // vd * 0.5
-        //x2 = vd;
-        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0
-        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0
-        z = _mm_sub_ss(z, vd);  // 1.5 - vd * 0.5 * y0 * y0
+		//  one step NR
+		z = b3v1_5;
+		vd = _mm_mul_ss(vd, b3vHalf);  // vd * 0.5
+		//x2 = vd;
+		vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0
+		vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0 * y0
+		z = _mm_sub_ss(z, vd);   // 1.5 - vd * 0.5 * y0 * y0
 
-        y = _mm_mul_ss(y, z);   // y0 * (1.5 - vd * 0.5 * y0 * y0)
+		y = _mm_mul_ss(y, z);  // y0 * (1.5 - vd * 0.5 * y0 * y0)
 
 		y = b3_splat_ps(y, 0x80);
 		mVec128 = _mm_mul_ps(mVec128, y);
 
-        #endif
-
+#endif
 
 		return *this;
 #else
@@ -306,15 +298,15 @@ public:
 #endif
 	}
 
-  /**@brief Return a normalized version of this vector */
+	/**@brief Return a normalized version of this vector */
 	B3_FORCE_INLINE b3Vector3 normalized() const;
 
-  /**@brief Return a rotated version of this vector
+	/**@brief Return a rotated version of this vector
    * @param wAxis The axis to rotate about
    * @param angle The angle to rotate by */
-	B3_FORCE_INLINE b3Vector3 rotate( const b3Vector3& wAxis, const b3Scalar angle ) const;
+	B3_FORCE_INLINE b3Vector3 rotate(const b3Vector3& wAxis, const b3Scalar angle) const;
 
-  /**@brief Return the angle between this and another vector
+	/**@brief Return the angle between this and another vector
    * @param v The other vector */
 	B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const
 	{
@@ -323,10 +315,10 @@ public:
 		return b3Acos(dot(v) / s);
 	}
 
-  /**@brief Return a vector will the absolute values of each element */
+	/**@brief Return a vector with the absolute values of each element */
 	B3_FORCE_INLINE b3Vector3 absolute() const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3MakeVector3(_mm_and_ps(mVec128, b3v3AbsfMask));
 #elif defined(B3_USE_NEON)
 		return b3Vector3(vabsq_f32(mVec128));
@@ -338,15 +330,15 @@ public:
 #endif
 	}
 
-  /**@brief Return the cross product between this and another vector
+	/**@brief Return the cross product between this and another vector
    * @param v The other vector */
 	B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	T, V;
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 T, V;
 
-		T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
-		V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));    //	(Y Z X 0)
+		V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
 
 		V = _mm_mul_ps(V, mVec128);
 		T = _mm_mul_ps(T, v.mVec128);
@@ -381,10 +373,10 @@ public:
 
 	B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		// cross:
-		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
-		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
+		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
 
 		V = _mm_mul_ps(V, v1.mVec128);
 		T = _mm_mul_ps(T, v2.mVec128);
@@ -422,25 +414,24 @@ public:
 		x = vadd_f32(x, vget_high_f32(V));
 		return vget_lane_f32(x, 0);
 #else
-		return
-			m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
-			m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
-			m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
+		return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
+			   m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
+			   m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
 #endif
 	}
 
-  /**@brief Return the axis with the smallest value
+	/**@brief Return the axis with the smallest value
    * Note return values are 0,1,2 for x, y, or z */
 	B3_FORCE_INLINE int minAxis() const
 	{
-		return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2);
+		return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2);
 	}
 
-  /**@brief Return the axis with the largest value
+	/**@brief Return the axis with the largest value
    * Note return values are 0,1,2 for x, y, or z */
 	B3_FORCE_INLINE int maxAxis() const
 	{
-		return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0);
+		return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0);
 	}
 
 	B3_FORCE_INLINE int furthestAxis() const
@@ -453,18 +444,17 @@ public:
 		return absolute().maxAxis();
 	}
 
-
 	B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vrt = _mm_load_ss(&rt);	//	(rt 0 0 0)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vrt = _mm_load_ss(&rt);  //	(rt 0 0 0)
 		b3Scalar s = b3Scalar(1.0) - rt;
-		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-		vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
 		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
-		vrt = b3_pshufd_ps(vrt, 0x80);	//	(rt rt rt 0.0)
+		vrt = b3_pshufd_ps(vrt, 0x80);  //	(rt rt rt 0.0)
 		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
-		__m128 tmp3 = _mm_add_ps(r0,r1);
+		__m128 tmp3 = _mm_add_ps(r0, r1);
 		mVec128 = tmp3;
 #elif defined(B3_USE_NEON)
 		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
@@ -480,14 +470,14 @@ public:
 #endif
 	}
 
-  /**@brief Return the linear interpolation between this and another vector
+	/**@brief Return the linear interpolation between this and another vector
    * @param v The other vector
    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
 	B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-		__m128	vt = _mm_load_ss(&t);	//	(t 0 0 0)
-		vt = b3_pshufd_ps(vt, 0x80);	//	(rt rt rt 0.0)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vt = _mm_load_ss(&t);  //	(t 0 0 0)
+		vt = b3_pshufd_ps(vt, 0x80);  //	(rt rt rt 0.0)
 		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
 		vl = _mm_mul_ps(vl, vt);
 		vl = _mm_add_ps(vl, mVec128);
@@ -500,18 +490,17 @@ public:
 
 		return b3Vector3(vl);
 #else
-		return
-			b3MakeVector3(	m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
-						m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
-						m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
+		return b3MakeVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
+							 m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
+							 m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
 #endif
 	}
 
-  /**@brief Elementwise multiply this vector by the other
+	/**@brief Elementwise multiply this vector by the other
    * @param v The other vector */
 	B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vmulq_f32(mVec128, v.mVec128);
@@ -523,53 +512,53 @@ public:
 		return *this;
 	}
 
-	 /**@brief Return the x value */
-		B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
-  /**@brief Return the y value */
-		B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
-  /**@brief Return the z value */
-		B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
-/**@brief Return the w value */
-		B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
-
-  /**@brief Set the x value */
-		B3_FORCE_INLINE void	setX(b3Scalar _x) { m_floats[0] = _x;};
-  /**@brief Set the y value */
-		B3_FORCE_INLINE void	setY(b3Scalar _y) { m_floats[1] = _y;};
-  /**@brief Set the z value */
-		B3_FORCE_INLINE void	setZ(b3Scalar _z) { m_floats[2] = _z;};
-  /**@brief Set the w value */
-		B3_FORCE_INLINE void	setW(b3Scalar _w) { m_floats[3] = _w;};
+	/**@brief Return the x value */
+	B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
+	/**@brief Return the y value */
+	B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
+	/**@brief Return the z value */
+	B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
+	/**@brief Return the w value */
+	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
+
+	/**@brief Set the x value */
+	B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; };
+	/**@brief Set the y value */
+	B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; };
+	/**@brief Set the z value */
+	B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; };
+	/**@brief Set the w value */
+	B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; };
 
 	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
 	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
 	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
-	B3_FORCE_INLINE	operator       b3Scalar *()       { return &m_floats[0]; }
-	B3_FORCE_INLINE	operator const b3Scalar *() const { return &m_floats[0]; }
+	B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
+	B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
 
-	B3_FORCE_INLINE	bool	operator==(const b3Vector3& other) const
+	B3_FORCE_INLINE bool operator==(const b3Vector3& other) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
 #else
-		return ((m_floats[3]==other.m_floats[3]) &&
-                (m_floats[2]==other.m_floats[2]) &&
-                (m_floats[1]==other.m_floats[1]) &&
-                (m_floats[0]==other.m_floats[0]));
+		return ((m_floats[3] == other.m_floats[3]) &&
+				(m_floats[2] == other.m_floats[2]) &&
+				(m_floats[1] == other.m_floats[1]) &&
+				(m_floats[0] == other.m_floats[0]));
 #endif
 	}
 
-	B3_FORCE_INLINE	bool	operator!=(const b3Vector3& other) const
+	B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const
 	{
 		return !(*this == other);
 	}
 
-  /**@brief Set each element to the max of the current values and the values of another b3Vector3
+	/**@brief Set each element to the max of the current values and the values of another b3Vector3
    * @param other The other b3Vector3 to compare with
    */
-	B3_FORCE_INLINE void	setMax(const b3Vector3& other)
+	B3_FORCE_INLINE void setMax(const b3Vector3& other)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_max_ps(mVec128, other.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vmaxq_f32(mVec128, other.mVec128);
@@ -581,12 +570,12 @@ public:
 #endif
 	}
 
-  /**@brief Set each element to the min of the current values and the values of another b3Vector3
+	/**@brief Set each element to the min of the current values and the values of another b3Vector3
    * @param other The other b3Vector3 to compare with
    */
-	B3_FORCE_INLINE void	setMin(const b3Vector3& other)
+	B3_FORCE_INLINE void setMin(const b3Vector3& other)
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = _mm_min_ps(mVec128, other.mVec128);
 #elif defined(B3_USE_NEON)
 		mVec128 = vminq_f32(mVec128, other.mVec128);
@@ -598,46 +587,46 @@ public:
 #endif
 	}
 
-	B3_FORCE_INLINE void 	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
 	{
-		m_floats[0]=_x;
-		m_floats[1]=_y;
-		m_floats[2]=_z;
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
 		m_floats[3] = b3Scalar(0.f);
 	}
 
-	void	getSkewSymmetricMatrix(b3Vector3* v0,b3Vector3* v1,b3Vector3* v2) const
+	void getSkewSymmetricMatrix(b3Vector3 * v0, b3Vector3 * v1, b3Vector3 * v2) const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 
-		__m128 V  = _mm_and_ps(mVec128, b3vFFF0fMask);
+		__m128 V = _mm_and_ps(mVec128, b3vFFF0fMask);
 		__m128 V0 = _mm_xor_ps(b3vMzeroMask, V);
 		__m128 V2 = _mm_movelh_ps(V0, V);
 
 		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
 
-        V0 = _mm_shuffle_ps(V0, V, 0xDB);
+		V0 = _mm_shuffle_ps(V0, V, 0xDB);
 		V2 = _mm_shuffle_ps(V2, V, 0xF9);
 
 		v0->mVec128 = V0;
 		v1->mVec128 = V1;
 		v2->mVec128 = V2;
 #else
-		v0->setValue(0.		,-getZ()		,getY());
-		v1->setValue(getZ()	,0.			,-getX());
-		v2->setValue(-getY()	,getX()	,0.);
+		v0->setValue(0., -getZ(), getY());
+		v1->setValue(getZ(), 0., -getX());
+		v2->setValue(-getY(), getX(), 0.);
 #endif
 	}
 
 	void setZero()
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
 #elif defined(B3_USE_NEON)
 		int32x4_t vi = vdupq_n_s32(0);
 		mVec128 = vreinterpretq_f32_s32(vi);
 #else
-		setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+		setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
 #endif
 	}
 
@@ -651,76 +640,76 @@ public:
 		return length2() < B3_EPSILON;
 	}
 
-	B3_FORCE_INLINE	void	serialize(struct	b3Vector3Data& dataOut) const;
+	B3_FORCE_INLINE void serialize(struct b3Vector3Data & dataOut) const;
 
-	B3_FORCE_INLINE	void	deSerialize(const struct	b3Vector3Data& dataIn);
+	B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn);
 
-	B3_FORCE_INLINE	void	serializeFloat(struct	b3Vector3FloatData& dataOut) const;
+	B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData & dataOut) const;
 
-	B3_FORCE_INLINE	void	deSerializeFloat(const struct	b3Vector3FloatData& dataIn);
+	B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn);
 
-	B3_FORCE_INLINE	void	serializeDouble(struct	b3Vector3DoubleData& dataOut) const;
+	B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData & dataOut) const;
 
-	B3_FORCE_INLINE	void	deSerializeDouble(const struct	b3Vector3DoubleData& dataIn);
+	B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn);
 
-        /**@brief returns index of maximum dot product between this and vectors in array[]
+	/**@brief returns index of maximum dot product between this and vectors in array[]
          * @param array The other vectors
          * @param array_count The number of other vectors
          * @param dotOut The maximum dot product */
-        B3_FORCE_INLINE   long    maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const;
+	B3_FORCE_INLINE long maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
 
-        /**@brief returns index of minimum dot product between this and vectors in array[]
+	/**@brief returns index of minimum dot product between this and vectors in array[]
          * @param array The other vectors
          * @param array_count The number of other vectors
          * @param dotOut The minimum dot product */
-        B3_FORCE_INLINE   long    minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const;
-
-    /* create a vector as  b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 ))  */
-    B3_FORCE_INLINE b3Vector3  dot3( const b3Vector3 &v0, const b3Vector3 &v1, const b3Vector3 &v2 ) const
-    {
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-
-        __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
-        __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
-        __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
-        __m128 b0 = _mm_unpacklo_ps( a0, a1 );
-        __m128 b1 = _mm_unpackhi_ps( a0, a1 );
-        __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
-        __m128 r = _mm_movelh_ps( b0, b2 );
-        r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
-        a2 = _mm_and_ps( a2, b3vxyzMaskf);
-        r = _mm_add_ps( r, b3CastdTo128f (_mm_move_sd( b3CastfTo128d(a2), b3CastfTo128d(b1) )));
-        return b3MakeVector3(r);
+	B3_FORCE_INLINE long minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
+
+	/* create a vector as  b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 ))  */
+	B3_FORCE_INLINE b3Vector3 dot3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+
+		__m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
+		__m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
+		__m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
+		__m128 b0 = _mm_unpacklo_ps(a0, a1);
+		__m128 b1 = _mm_unpackhi_ps(a0, a1);
+		__m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
+		__m128 r = _mm_movelh_ps(b0, b2);
+		r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));
+		a2 = _mm_and_ps(a2, b3vxyzMaskf);
+		r = _mm_add_ps(r, b3CastdTo128f(_mm_move_sd(b3CastfTo128d(a2), b3CastfTo128d(b1))));
+		return b3MakeVector3(r);
 
 #elif defined(B3_USE_NEON)
-        static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
-        float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
-        float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
-        float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
-        float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
-        a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
-        float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
-        float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
-        return b3Vector3( vcombine_f32(b0, b1) );
+		static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
+		float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
+		float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
+		float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
+		float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));
+		a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);
+		float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
+		float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
+		return b3Vector3(vcombine_f32(b0, b1));
 #else
-		return b3MakeVector3( dot(v0), dot(v1), dot(v2));
+		return b3MakeVector3(dot(v0), dot(v1), dot(v2));
 #endif
-    }
+	}
 };
 
 /**@brief Return the sum of two vectors (Point symantics)*/
 B3_FORCE_INLINE b3Vector3
 operator+(const b3Vector3& v1, const b3Vector3& v2)
 {
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
 #elif defined(B3_USE_NEON)
 	return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128));
 #else
 	return b3MakeVector3(
-			v1.m_floats[0] + v2.m_floats[0],
-			v1.m_floats[1] + v2.m_floats[1],
-			v1.m_floats[2] + v2.m_floats[2]);
+		v1.m_floats[0] + v2.m_floats[0],
+		v1.m_floats[1] + v2.m_floats[1],
+		v1.m_floats[2] + v2.m_floats[2]);
 #endif
 }
 
@@ -728,15 +717,15 @@ operator+(const b3Vector3& v1, const b3Vector3& v2)
 B3_FORCE_INLINE b3Vector3
 operator*(const b3Vector3& v1, const b3Vector3& v2)
 {
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
 #elif defined(B3_USE_NEON)
 	return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128));
 #else
 	return b3MakeVector3(
-			v1.m_floats[0] * v2.m_floats[0],
-			v1.m_floats[1] * v2.m_floats[1],
-			v1.m_floats[2] * v2.m_floats[2]);
+		v1.m_floats[0] * v2.m_floats[0],
+		v1.m_floats[1] * v2.m_floats[1],
+		v1.m_floats[2] * v2.m_floats[2]);
 #endif
 }
 
@@ -744,7 +733,7 @@ operator*(const b3Vector3& v1, const b3Vector3& v2)
 B3_FORCE_INLINE b3Vector3
 operator-(const b3Vector3& v1, const b3Vector3& v2)
 {
-#if (defined(B3_USE_SSE_IN_API)  && defined(B3_USE_SSE))
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 
 	//	without _mm_and_ps this code causes slowdown in Concave moving
 	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
@@ -754,9 +743,9 @@ operator-(const b3Vector3& v1, const b3Vector3& v2)
 	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
 #else
 	return b3MakeVector3(
-			v1.m_floats[0] - v2.m_floats[0],
-			v1.m_floats[1] - v2.m_floats[1],
-			v1.m_floats[2] - v2.m_floats[2]);
+		v1.m_floats[0] - v2.m_floats[0],
+		v1.m_floats[1] - v2.m_floats[1],
+		v1.m_floats[2] - v2.m_floats[2]);
 #endif
 }
 
@@ -764,7 +753,7 @@ operator-(const b3Vector3& v1, const b3Vector3& v2)
 B3_FORCE_INLINE b3Vector3
 operator-(const b3Vector3& v)
 {
-#if (defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE))
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 	__m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask);
 	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
 #elif defined(B3_USE_NEON)
@@ -778,9 +767,9 @@ operator-(const b3Vector3& v)
 B3_FORCE_INLINE b3Vector3
 operator*(const b3Vector3& v, const b3Scalar& s)
 {
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
-	__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-	vs = b3_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+	vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
 	return b3MakeVector3(_mm_mul_ps(v.mVec128, vs));
 #elif defined(B3_USE_NEON)
 	float32x4_t r = vmulq_n_f32(v.mVec128, s);
@@ -802,7 +791,7 @@ B3_FORCE_INLINE b3Vector3
 operator/(const b3Vector3& v, const b3Scalar& s)
 {
 	b3FullAssert(s != b3Scalar(0.0));
-#if 0 //defined(B3_USE_SSE_IN_API)
+#if 0  //defined(B3_USE_SSE_IN_API)
 // this code is not faster !
 	__m128 vs = _mm_load_ss(&s);
     vs = _mm_div_ss(b3v1110, vs);
@@ -818,7 +807,7 @@ operator/(const b3Vector3& v, const b3Scalar& s)
 B3_FORCE_INLINE b3Vector3
 operator/(const b3Vector3& v1, const b3Vector3& v2)
 {
-#if (defined(B3_USE_SSE_IN_API)&& defined (B3_USE_SSE))
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
 	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
 	vec = _mm_and_ps(vec, b3vFFF0fMask);
 	return b3MakeVector3(vec);
@@ -828,19 +817,19 @@ operator/(const b3Vector3& v1, const b3Vector3& v2)
 	x = v1.mVec128;
 	y = v2.mVec128;
 
-	v = vrecpeq_f32(y);			// v ~ 1/y
-	m = vrecpsq_f32(y, v);		// m = (2-v*y)
-	v = vmulq_f32(v, m);		// vv = v*m ~~ 1/y
-	m = vrecpsq_f32(y, v);		// mm = (2-vv*y)
-	v = vmulq_f32(v, x);		// x*vv
-	v = vmulq_f32(v, m);		// (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
+	v = vrecpeq_f32(y);     // v ~ 1/y
+	m = vrecpsq_f32(y, v);  // m = (2-v*y)
+	v = vmulq_f32(v, m);    // vv = v*m ~~ 1/y
+	m = vrecpsq_f32(y, v);  // mm = (2-vv*y)
+	v = vmulq_f32(v, x);    // x*vv
+	v = vmulq_f32(v, m);    // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
 
 	return b3Vector3(v);
 #else
 	return b3MakeVector3(
-			v1.m_floats[0] / v2.m_floats[0],
-			v1.m_floats[1] / v2.m_floats[1],
-			v1.m_floats[2] / v2.m_floats[2]);
+		v1.m_floats[0] / v2.m_floats[0],
+		v1.m_floats[1] / v2.m_floats[1],
+		v1.m_floats[2] / v2.m_floats[2]);
 #endif
 }
 
@@ -851,7 +840,6 @@ b3Dot(const b3Vector3& v1, const b3Vector3& v2)
 	return v1.dot(v2);
 }
 
-
 /**@brief Return the distance squared between two vectors */
 B3_FORCE_INLINE b3Scalar
 b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
@@ -859,7 +847,6 @@ b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
 	return v1.distance2(v2);
 }
 
-
 /**@brief Return the distance between two vectors */
 B3_FORCE_INLINE b3Scalar
 b3Distance(const b3Vector3& v1, const b3Vector3& v2)
@@ -897,8 +884,6 @@ b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t)
 	return v1.lerp(v2, t);
 }
 
-
-
 B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const
 {
 	return (v - *this).length2();
@@ -911,7 +896,7 @@ B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const
 
 B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const
 {
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 	b3Vector3 norm = *this;
 
 	return norm.normalize();
@@ -920,143 +905,136 @@ B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const
 #endif
 }
 
-B3_FORCE_INLINE b3Vector3 b3Vector3::rotate( const b3Vector3& wAxis, const b3Scalar _angle ) const
+B3_FORCE_INLINE b3Vector3 b3Vector3::rotate(const b3Vector3& wAxis, const b3Scalar _angle) const
 {
 	// wAxis must be a unit lenght vector
 
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 
-    __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
-	b3Scalar ssin = b3Sin( _angle );
-    __m128 C = wAxis.cross( b3MakeVector3(mVec128) ).mVec128;
+	__m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
+	b3Scalar ssin = b3Sin(_angle);
+	__m128 C = wAxis.cross(b3MakeVector3(mVec128)).mVec128;
 	O = _mm_and_ps(O, b3vFFF0fMask);
-    b3Scalar scos = b3Cos( _angle );
+	b3Scalar scos = b3Cos(_angle);
 
-	__m128 vsin = _mm_load_ss(&ssin);	//	(S 0 0 0)
-    __m128 vcos = _mm_load_ss(&scos);	//	(S 0 0 0)
+	__m128 vsin = _mm_load_ss(&ssin);  //	(S 0 0 0)
+	__m128 vcos = _mm_load_ss(&scos);  //	(S 0 0 0)
 
-	__m128 Y = b3_pshufd_ps(O, 0xC9);	//	(Y Z X 0)
-	__m128 Z = b3_pshufd_ps(O, 0xD2);	//	(Z X Y 0)
+	__m128 Y = b3_pshufd_ps(O, 0xC9);  //	(Y Z X 0)
+	__m128 Z = b3_pshufd_ps(O, 0xD2);  //	(Z X Y 0)
 	O = _mm_add_ps(O, Y);
-	vsin = b3_pshufd_ps(vsin, 0x80);	//	(S S S 0)
+	vsin = b3_pshufd_ps(vsin, 0x80);  //	(S S S 0)
 	O = _mm_add_ps(O, Z);
-    vcos = b3_pshufd_ps(vcos, 0x80);	//	(S S S 0)
+	vcos = b3_pshufd_ps(vcos, 0x80);  //	(S S S 0)
 
-    vsin = vsin * C;
+	vsin = vsin * C;
 	O = O * wAxis.mVec128;
 	__m128 X = mVec128 - O;
 
-    O = O + vsin;
+	O = O + vsin;
 	vcos = vcos * X;
 	O = O + vcos;
 
 	return b3MakeVector3(O);
 #else
-	b3Vector3 o = wAxis * wAxis.dot( *this );
+	b3Vector3 o = wAxis * wAxis.dot(*this);
 	b3Vector3 _x = *this - o;
 	b3Vector3 _y;
 
-	_y = wAxis.cross( *this );
+	_y = wAxis.cross(*this);
 
-	return ( o + _x * b3Cos( _angle ) + _y * b3Sin( _angle ) );
+	return (o + _x * b3Cos(_angle) + _y * b3Sin(_angle));
 #endif
 }
 
-B3_FORCE_INLINE   long    b3Vector3::maxDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const
+B3_FORCE_INLINE long b3Vector3::maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
 {
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
-    #if defined _WIN32 || defined (B3_USE_SSE)
-        const long scalar_cutoff = 10;
-        long b3_maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
-    #elif defined B3_USE_NEON
-        const long scalar_cutoff = 4;
-        extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
-    #endif
-    if( array_count < scalar_cutoff )
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+#if defined _WIN32 || defined(B3_USE_SSE)
+	const long scalar_cutoff = 10;
+	long b3_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#elif defined B3_USE_NEON
+	const long scalar_cutoff = 4;
+	extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#endif
+	if (array_count < scalar_cutoff)
 #else
 
-#endif//B3_USE_SSE || B3_USE_NEON
-    {
-        b3Scalar maxDot = -B3_INFINITY;
-        int i = 0;
-        int ptIndex = -1;
-        for( i = 0; i < array_count; i++ )
-        {
-            b3Scalar dot = array[i].dot(*this);
-
-            if( dot > maxDot )
-            {
-                maxDot = dot;
-                ptIndex = i;
-            }
-        }
-
-		b3Assert(ptIndex>=0);
-        if (ptIndex<0)
+#endif  //B3_USE_SSE || B3_USE_NEON
+	{
+		b3Scalar maxDot = -B3_INFINITY;
+		int i = 0;
+		int ptIndex = -1;
+		for (i = 0; i < array_count; i++)
+		{
+			b3Scalar dot = array[i].dot(*this);
+
+			if (dot > maxDot)
+			{
+				maxDot = dot;
+				ptIndex = i;
+			}
+		}
+
+		b3Assert(ptIndex >= 0);
+		if (ptIndex < 0)
 		{
 			ptIndex = 0;
 		}
-        dotOut = maxDot;
-        return ptIndex;
-    }
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
-    return b3_maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+		dotOut = maxDot;
+		return ptIndex;
+	}
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+	return b3_maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
 #endif
 }
 
-B3_FORCE_INLINE   long    b3Vector3::minDot( const b3Vector3 *array, long array_count, b3Scalar &dotOut ) const
+B3_FORCE_INLINE long b3Vector3::minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
 {
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
-    #if defined B3_USE_SSE
-        const long scalar_cutoff = 10;
-        long b3_mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
-    #elif defined B3_USE_NEON
-        const long scalar_cutoff = 4;
-        extern long (*b3_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
-    #else
-        #error unhandled arch!
-    #endif
-
-    if( array_count < scalar_cutoff )
-#endif//B3_USE_SSE || B3_USE_NEON
-    {
-        b3Scalar  minDot = B3_INFINITY;
-        int i = 0;
-        int ptIndex = -1;
-
-        for( i = 0; i < array_count; i++ )
-        {
-            b3Scalar dot = array[i].dot(*this);
-
-            if( dot < minDot )
-            {
-                minDot = dot;
-                ptIndex = i;
-            }
-        }
-
-        dotOut = minDot;
-
-        return ptIndex;
-    }
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
-    return b3_mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+#if defined B3_USE_SSE
+	const long scalar_cutoff = 10;
+	long b3_mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#elif defined B3_USE_NEON
+	const long scalar_cutoff = 4;
+	extern long (*b3_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#else
+#error unhandled arch!
 #endif
-}
-
-
-class b3Vector4 : public b3Vector3
-{
-public:
 
+	if (array_count < scalar_cutoff)
+#endif  //B3_USE_SSE || B3_USE_NEON
+	{
+		b3Scalar minDot = B3_INFINITY;
+		int i = 0;
+		int ptIndex = -1;
 
+		for (i = 0; i < array_count; i++)
+		{
+			b3Scalar dot = array[i].dot(*this);
 
+			if (dot < minDot)
+			{
+				minDot = dot;
+				ptIndex = i;
+			}
+		}
 
+		dotOut = minDot;
 
+		return ptIndex;
+	}
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+	return b3_mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
+#endif
+}
 
+class b3Vector4 : public b3Vector3
+{
+public:
 	B3_FORCE_INLINE b3Vector4 absolute4() const
 	{
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 		return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask));
 #elif defined(B3_USE_NEON)
 		return b3Vector4(vabsq_f32(mVec128));
@@ -1069,11 +1047,9 @@ public:
 #endif
 	}
 
+	b3Scalar getW() const { return m_floats[3]; }
 
-	b3Scalar	getW() const { return m_floats[3];}
-
-
-		B3_FORCE_INLINE int maxAxis4() const
+	B3_FORCE_INLINE int maxAxis4() const
 	{
 		int maxIndex = -1;
 		b3Scalar maxVal = b3Scalar(-B3_LARGE_FLOAT);
@@ -1090,7 +1066,7 @@ public:
 		if (m_floats[2] > maxVal)
 		{
 			maxIndex = 2;
-			maxVal =m_floats[2];
+			maxVal = m_floats[2];
 		}
 		if (m_floats[3] > maxVal)
 		{
@@ -1100,7 +1076,6 @@ public:
 		return maxIndex;
 	}
 
-
 	B3_FORCE_INLINE int minAxis4() const
 	{
 		int minIndex = -1;
@@ -1118,7 +1093,7 @@ public:
 		if (m_floats[2] < minVal)
 		{
 			minIndex = 2;
-			minVal =m_floats[2];
+			minVal = m_floats[2];
 		}
 		if (m_floats[3] < minVal)
 		{
@@ -1129,216 +1104,200 @@ public:
 		return minIndex;
 	}
 
-
 	B3_FORCE_INLINE int closestAxis4() const
 	{
 		return absolute4().maxAxis4();
 	}
 
-
-
-
-  /**@brief Set x,y,z and zero w
+	/**@brief Set x,y,z and zero w
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    */
 
-
-/*		void getValue(b3Scalar *m) const
+	/*		void getValue(b3Scalar *m) const
 		{
 			m[0] = m_floats[0];
 			m[1] = m_floats[1];
 			m[2] =m_floats[2];
 		}
 */
-/**@brief Set the values
+	/**@brief Set the values
    * @param x Value of x
    * @param y Value of y
    * @param z Value of z
    * @param w Value of w
    */
-		B3_FORCE_INLINE void	setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z,const b3Scalar& _w)
-		{
-			m_floats[0]=_x;
-			m_floats[1]=_y;
-			m_floats[2]=_z;
-			m_floats[3]=_w;
-		}
-
-
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = _w;
+	}
 };
 
-
 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
-B3_FORCE_INLINE void	b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
+B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
 {
-	#ifdef B3_USE_DOUBLE_PRECISION
-	unsigned char* dest = (unsigned char*) &destVal;
-	unsigned char* src  = (unsigned char*) &sourceVal;
+#ifdef B3_USE_DOUBLE_PRECISION
+	unsigned char* dest = (unsigned char*)&destVal;
+	unsigned char* src = (unsigned char*)&sourceVal;
 	dest[0] = src[7];
-    dest[1] = src[6];
-    dest[2] = src[5];
-    dest[3] = src[4];
-    dest[4] = src[3];
-    dest[5] = src[2];
-    dest[6] = src[1];
-    dest[7] = src[0];
+	dest[1] = src[6];
+	dest[2] = src[5];
+	dest[3] = src[4];
+	dest[4] = src[3];
+	dest[5] = src[2];
+	dest[6] = src[1];
+	dest[7] = src[0];
 #else
-	unsigned char* dest = (unsigned char*) &destVal;
-	unsigned char* src  = (unsigned char*) &sourceVal;
+	unsigned char* dest = (unsigned char*)&destVal;
+	unsigned char* src = (unsigned char*)&sourceVal;
 	dest[0] = src[3];
-    dest[1] = src[2];
-    dest[2] = src[1];
-    dest[3] = src[0];
-#endif //B3_USE_DOUBLE_PRECISION
+	dest[1] = src[2];
+	dest[2] = src[1];
+	dest[3] = src[0];
+#endif  //B3_USE_DOUBLE_PRECISION
 }
 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
-B3_FORCE_INLINE void	b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
+B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
 {
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 	{
-		b3SwapScalarEndian(sourceVec[i],destVec[i]);
+		b3SwapScalarEndian(sourceVec[i], destVec[i]);
 	}
-
 }
 
 ///b3UnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
-B3_FORCE_INLINE void	b3UnSwapVector3Endian(b3Vector3& vector)
+B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector)
 {
-
-	b3Vector3	swappedVec;
-	for (int i=0;i<4;i++)
+	b3Vector3 swappedVec;
+	for (int i = 0; i < 4; i++)
 	{
-		b3SwapScalarEndian(vector[i],swappedVec[i]);
+		b3SwapScalarEndian(vector[i], swappedVec[i]);
 	}
 	vector = swappedVec;
 }
 
 template <class T>
-B3_FORCE_INLINE void b3PlaneSpace1 (const T& n, T& p, T& q)
+B3_FORCE_INLINE void b3PlaneSpace1(const T& n, T& p, T& q)
 {
-  if (b3Fabs(n[2]) > B3_SQRT12) {
-    // choose p in y-z plane
-    b3Scalar a = n[1]*n[1] + n[2]*n[2];
-    b3Scalar k = b3RecipSqrt (a);
-    p[0] = 0;
-	p[1] = -n[2]*k;
-	p[2] = n[1]*k;
-    // set q = n x p
-    q[0] = a*k;
-	q[1] = -n[0]*p[2];
-	q[2] = n[0]*p[1];
-  }
-  else {
-    // choose p in x-y plane
-    b3Scalar a = n[0]*n[0] + n[1]*n[1];
-    b3Scalar k = b3RecipSqrt (a);
-    p[0] = -n[1]*k;
-	p[1] = n[0]*k;
-	p[2] = 0;
-    // set q = n x p
-    q[0] = -n[2]*p[1];
-	q[1] = n[2]*p[0];
-	q[2] = a*k;
-  }
+	if (b3Fabs(n[2]) > B3_SQRT12)
+	{
+		// choose p in y-z plane
+		b3Scalar a = n[1] * n[1] + n[2] * n[2];
+		b3Scalar k = b3RecipSqrt(a);
+		p[0] = 0;
+		p[1] = -n[2] * k;
+		p[2] = n[1] * k;
+		// set q = n x p
+		q[0] = a * k;
+		q[1] = -n[0] * p[2];
+		q[2] = n[0] * p[1];
+	}
+	else
+	{
+		// choose p in x-y plane
+		b3Scalar a = n[0] * n[0] + n[1] * n[1];
+		b3Scalar k = b3RecipSqrt(a);
+		p[0] = -n[1] * k;
+		p[1] = n[0] * k;
+		p[2] = 0;
+		// set q = n x p
+		q[0] = -n[2] * p[1];
+		q[1] = n[2] * p[0];
+		q[2] = a * k;
+	}
 }
 
-
-struct	b3Vector3FloatData
+struct b3Vector3FloatData
 {
-	float	m_floats[4];
+	float m_floats[4];
 };
 
-struct	b3Vector3DoubleData
+struct b3Vector3DoubleData
 {
-	double	m_floats[4];
-
+	double m_floats[4];
 };
 
-B3_FORCE_INLINE	void	b3Vector3::serializeFloat(struct	b3Vector3FloatData& dataOut) const
+B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const
 {
 	///could also do a memcpy, check if it is worth it
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		dataOut.m_floats[i] = float(m_floats[i]);
 }
 
-B3_FORCE_INLINE void	b3Vector3::deSerializeFloat(const struct	b3Vector3FloatData& dataIn)
+B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn)
 {
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		m_floats[i] = b3Scalar(dataIn.m_floats[i]);
 }
 
-
-B3_FORCE_INLINE	void	b3Vector3::serializeDouble(struct	b3Vector3DoubleData& dataOut) const
+B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const
 {
 	///could also do a memcpy, check if it is worth it
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		dataOut.m_floats[i] = double(m_floats[i]);
 }
 
-B3_FORCE_INLINE void	b3Vector3::deSerializeDouble(const struct	b3Vector3DoubleData& dataIn)
+B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn)
 {
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		m_floats[i] = b3Scalar(dataIn.m_floats[i]);
 }
 
-
-B3_FORCE_INLINE	void	b3Vector3::serialize(struct	b3Vector3Data& dataOut) const
+B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const
 {
 	///could also do a memcpy, check if it is worth it
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		dataOut.m_floats[i] = m_floats[i];
 }
 
-B3_FORCE_INLINE void	b3Vector3::deSerialize(const struct	b3Vector3Data& dataIn)
+B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn)
 {
-	for (int i=0;i<4;i++)
+	for (int i = 0; i < 4; i++)
 		m_floats[i] = dataIn.m_floats[i];
 }
 
-
-
-
-inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z)
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z)
 {
-	b3Vector3	tmp;
-	tmp.setValue(x,y,z);
+	b3Vector3 tmp;
+	tmp.setValue(x, y, z);
 	return tmp;
 }
 
-inline b3Vector3 b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z, b3Scalar w)
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
 {
-	b3Vector3	tmp;
-	tmp.setValue(x,y,z);
+	b3Vector3 tmp;
+	tmp.setValue(x, y, z);
 	tmp.w = w;
 	return tmp;
 }
 
-inline b3Vector4 b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w)
+inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
 {
-	b3Vector4	tmp;
-	tmp.setValue(x,y,z,w);
+	b3Vector4 tmp;
+	tmp.setValue(x, y, z, w);
 	return tmp;
 }
 
-#if defined(B3_USE_SSE_IN_API) && defined (B3_USE_SSE)
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
 
-inline b3Vector3 b3MakeVector3( b3SimdFloat4 v)
+inline b3Vector3 b3MakeVector3(b3SimdFloat4 v)
 {
-        b3Vector3 tmp;
-        tmp.set128(v);
-        return tmp;
+	b3Vector3 tmp;
+	tmp.set128(v);
+	return tmp;
 }
 
 inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec)
 {
-	b3Vector4	tmp;
+	b3Vector4 tmp;
 	tmp.set128(vec);
 	return tmp;
 }
 
 #endif
 
-
-#endif //B3_VECTOR3_H
+#endif  //B3_VECTOR3_H
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Float4.h b/thirdparty/bullet/Bullet3Common/shared/b3Float4.h
index 5e4b95bcee..d8a9f47411 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3Float4.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3Float4.h
@@ -4,94 +4,87 @@
 #include "Bullet3Common/shared/b3PlatformDefinitions.h"
 
 #ifdef __cplusplus
-	#include "Bullet3Common/b3Vector3.h"
-	#define b3Float4 b3Vector3
-	#define b3Float4ConstArg const b3Vector3&
-	#define b3Dot3F4 b3Dot
-	#define b3Cross3 b3Cross
-	#define	b3MakeFloat4  b3MakeVector3
-	inline b3Vector3 b3Normalized(const b3Vector3& vec)
-	{
-		return vec.normalized();
-	}
-
-	inline b3Float4 b3FastNormalized3(b3Float4ConstArg v)
-	{
-		return v.normalized();
-	}
-
-	inline b3Float4 b3MaxFloat4 (const b3Float4& a, const b3Float4& b)
-	{
-		b3Float4 tmp = a;
-		tmp.setMax(b);
-		return tmp;
-	}
-	inline b3Float4 b3MinFloat4 (const b3Float4& a, const b3Float4& b)
-	{
-		b3Float4 tmp = a;
-		tmp.setMin(b);
-		return tmp;
-	}
+#include "Bullet3Common/b3Vector3.h"
+#define b3Float4 b3Vector3
+#define b3Float4ConstArg const b3Vector3&
+#define b3Dot3F4 b3Dot
+#define b3Cross3 b3Cross
+#define b3MakeFloat4 b3MakeVector3
+inline b3Vector3 b3Normalized(const b3Vector3& vec)
+{
+	return vec.normalized();
+}
 
+inline b3Float4 b3FastNormalized3(b3Float4ConstArg v)
+{
+	return v.normalized();
+}
 
+inline b3Float4 b3MaxFloat4(const b3Float4& a, const b3Float4& b)
+{
+	b3Float4 tmp = a;
+	tmp.setMax(b);
+	return tmp;
+}
+inline b3Float4 b3MinFloat4(const b3Float4& a, const b3Float4& b)
+{
+	b3Float4 tmp = a;
+	tmp.setMin(b);
+	return tmp;
+}
 
 #else
-	typedef float4	b3Float4;
-	#define b3Float4ConstArg const b3Float4
-	#define b3MakeFloat4 (float4)
-	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)
-	{
-		float4 a1 = b3MakeFloat4(v0.xyz,0.f);
-		float4 b1 = b3MakeFloat4(v1.xyz,0.f);
-		return dot(a1, b1);
-	}
-	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)
-	{
-		float4 a1 = b3MakeFloat4(v0.xyz,0.f);
-		float4 b1 = b3MakeFloat4(v1.xyz,0.f);
-		return cross(a1, b1);
-	}
-	#define b3MinFloat4 min
-	#define b3MaxFloat4 max
-
-	#define b3Normalized(a) normalize(a)
+typedef float4 b3Float4;
+#define b3Float4ConstArg const b3Float4
+#define b3MakeFloat4 (float4)
+float b3Dot3F4(b3Float4ConstArg v0, b3Float4ConstArg v1)
+{
+	float4 a1 = b3MakeFloat4(v0.xyz, 0.f);
+	float4 b1 = b3MakeFloat4(v1.xyz, 0.f);
+	return dot(a1, b1);
+}
+b3Float4 b3Cross3(b3Float4ConstArg v0, b3Float4ConstArg v1)
+{
+	float4 a1 = b3MakeFloat4(v0.xyz, 0.f);
+	float4 b1 = b3MakeFloat4(v1.xyz, 0.f);
+	return cross(a1, b1);
+}
+#define b3MinFloat4 min
+#define b3MaxFloat4 max
 
-#endif 
+#define b3Normalized(a) normalize(a)
 
+#endif
 
-		
 inline bool b3IsAlmostZero(b3Float4ConstArg v)
 {
-	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	
+	if (b3Fabs(v.x) > 1e-6 || b3Fabs(v.y) > 1e-6 || b3Fabs(v.z) > 1e-6)
 		return false;
 	return true;
 }
 
-
-inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )
+inline int b3MaxDot(b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut)
 {
-    float maxDot = -B3_INFINITY;
-    int i = 0;
-    int ptIndex = -1;
-    for( i = 0; i < vecLen; i++ )
-    {
-        float dot = b3Dot3F4(vecArray[i],vec);
-            
-        if( dot > maxDot )
-        {
-            maxDot = dot;
-            ptIndex = i;
-        }
-    }
-	b3Assert(ptIndex>=0);
-    if (ptIndex<0)
+	float maxDot = -B3_INFINITY;
+	int i = 0;
+	int ptIndex = -1;
+	for (i = 0; i < vecLen; i++)
+	{
+		float dot = b3Dot3F4(vecArray[i], vec);
+
+		if (dot > maxDot)
+		{
+			maxDot = dot;
+			ptIndex = i;
+		}
+	}
+	b3Assert(ptIndex >= 0);
+	if (ptIndex < 0)
 	{
 		ptIndex = 0;
 	}
-    *dotOut = maxDot;
-    return ptIndex;
+	*dotOut = maxDot;
+	return ptIndex;
 }
 
-
-
-#endif //B3_FLOAT4_H
+#endif  //B3_FLOAT4_H
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Int2.h b/thirdparty/bullet/Bullet3Common/shared/b3Int2.h
index f1d01f81a5..7b84de4436 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3Int2.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3Int2.h
@@ -20,11 +20,10 @@ subject to the following restrictions:
 
 struct b3UnsignedInt2
 {
-	union
-	{
+	union {
 		struct
 		{
-			unsigned int x,y;
+			unsigned int x, y;
 		};
 		struct
 		{
@@ -35,11 +34,10 @@ struct b3UnsignedInt2
 
 struct b3Int2
 {
-	union
-	{
+	union {
 		struct
 		{
-			int x,y;
+			int x, y;
 		};
 		struct
 		{
@@ -51,7 +49,8 @@ struct b3Int2
 inline b3Int2 b3MakeInt2(int x, int y)
 {
 	b3Int2 v;
-	v.s[0] = x; v.s[1] = y;
+	v.s[0] = x;
+	v.s[1] = y;
 	return v;
 }
 #else
@@ -60,5 +59,5 @@ inline b3Int2 b3MakeInt2(int x, int y)
 #define b3Int2 int2
 #define b3MakeInt2 (int2)
 
-#endif //__cplusplus
+#endif  //__cplusplus
 #endif
 \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Int4.h b/thirdparty/bullet/Bullet3Common/shared/b3Int4.h
index aa02d6beef..f6a1754245 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3Int4.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3Int4.h
@@ -5,16 +5,15 @@
 
 #include "Bullet3Common/b3Scalar.h"
 
-
-B3_ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3UnsignedInt4
 {
 	B3_DECLARE_ALIGNED_ALLOCATOR();
 
-	union
-	{
+	union {
 		struct
 		{
-			unsigned int x,y,z,w;
+			unsigned int x, y, z, w;
 		};
 		struct
 		{
@@ -23,15 +22,15 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4
 	};
 };
 
-B3_ATTRIBUTE_ALIGNED16(struct) b3Int4
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3Int4
 {
 	B3_DECLARE_ALIGNED_ALLOCATOR();
 
-	union
-	{
+	union {
 		struct
 		{
-			int x,y,z,w;
+			int x, y, z, w;
 		};
 		struct
 		{
@@ -43,26 +42,30 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3Int4
 B3_FORCE_INLINE b3Int4 b3MakeInt4(int x, int y, int z, int w = 0)
 {
 	b3Int4 v;
-	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
+	v.s[0] = x;
+	v.s[1] = y;
+	v.s[2] = z;
+	v.s[3] = w;
 	return v;
 }
 
 B3_FORCE_INLINE b3UnsignedInt4 b3MakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
 {
 	b3UnsignedInt4 v;
-	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
+	v.s[0] = x;
+	v.s[1] = y;
+	v.s[2] = z;
+	v.s[3] = w;
 	return v;
 }
 
 #else
 
-
 #define b3UnsignedInt4 uint4
 #define b3Int4 int4
 #define b3MakeInt4 (int4)
 #define b3MakeUnsignedInt4 (uint4)
 
+#endif  //__cplusplus
 
-#endif //__cplusplus
-
-#endif //B3_INT4_H
+#endif  //B3_INT4_H
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h b/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h
index 7b1fef32f8..ce6482b5a6 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3Mat3x3.h
@@ -4,7 +4,6 @@
 
 #include "Bullet3Common/shared/b3Quat.h"
 
-
 #ifdef __cplusplus
 
 #include "Bullet3Common/b3Matrix3x3.h"
@@ -22,43 +21,41 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg mat)
 	return mat.absolute();
 }
 
-#define b3GetRow(m,row) m.getRow(row)
+#define b3GetRow(m, row) m.getRow(row)
 
-__inline
-b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b)
+__inline b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b)
 {
-	return b*a;
+	return b * a;
 }
 
-
 #else
 
 typedef struct
 {
 	b3Float4 m_row[3];
-}b3Mat3x3;
+} b3Mat3x3;
 
 #define b3Mat3x3ConstArg const b3Mat3x3
-#define b3GetRow(m,row) (m.m_row[row])
+#define b3GetRow(m, row) (m.m_row[row])
 
 inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)
 {
-	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
+	b3Float4 quat2 = (b3Float4)(quat.x * quat.x, quat.y * quat.y, quat.z * quat.z, 0.f);
 	b3Mat3x3 out;
 
-	out.m_row[0].x=1-2*quat2.y-2*quat2.z;
-	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
-	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
+	out.m_row[0].x = 1 - 2 * quat2.y - 2 * quat2.z;
+	out.m_row[0].y = 2 * quat.x * quat.y - 2 * quat.w * quat.z;
+	out.m_row[0].z = 2 * quat.x * quat.z + 2 * quat.w * quat.y;
 	out.m_row[0].w = 0.f;
 
-	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
-	out.m_row[1].y=1-2*quat2.x-2*quat2.z;
-	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
+	out.m_row[1].x = 2 * quat.x * quat.y + 2 * quat.w * quat.z;
+	out.m_row[1].y = 1 - 2 * quat2.x - 2 * quat2.z;
+	out.m_row[1].z = 2 * quat.y * quat.z - 2 * quat.w * quat.x;
 	out.m_row[1].w = 0.f;
 
-	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
-	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
-	out.m_row[2].z=1-2*quat2.x-2*quat2.y;
+	out.m_row[2].x = 2 * quat.x * quat.z - 2 * quat.w * quat.y;
+	out.m_row[2].y = 2 * quat.y * quat.z + 2 * quat.w * quat.x;
+	out.m_row[2].z = 1 - 2 * quat2.x - 2 * quat2.y;
 	out.m_row[2].w = 0.f;
 
 	return out;
@@ -73,27 +70,19 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)
 	return out;
 }
 
+__inline b3Mat3x3 mtZero();
 
-__inline
-b3Mat3x3 mtZero();
-
-__inline
-b3Mat3x3 mtIdentity();
+__inline b3Mat3x3 mtIdentity();
 
-__inline
-b3Mat3x3 mtTranspose(b3Mat3x3 m);
+__inline b3Mat3x3 mtTranspose(b3Mat3x3 m);
 
-__inline
-b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);
+__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);
 
-__inline
-b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);
+__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);
 
-__inline
-b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);
+__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);
 
-__inline
-b3Mat3x3 mtZero()
+__inline b3Mat3x3 mtZero()
 {
 	b3Mat3x3 m;
 	m.m_row[0] = (b3Float4)(0.f);
@@ -102,18 +91,16 @@ b3Mat3x3 mtZero()
 	return m;
 }
 
-__inline
-b3Mat3x3 mtIdentity()
+__inline b3Mat3x3 mtIdentity()
 {
 	b3Mat3x3 m;
-	m.m_row[0] = (b3Float4)(1,0,0,0);
-	m.m_row[1] = (b3Float4)(0,1,0,0);
-	m.m_row[2] = (b3Float4)(0,0,1,0);
+	m.m_row[0] = (b3Float4)(1, 0, 0, 0);
+	m.m_row[1] = (b3Float4)(0, 1, 0, 0);
+	m.m_row[2] = (b3Float4)(0, 0, 1, 0);
 	return m;
 }
 
-__inline
-b3Mat3x3 mtTranspose(b3Mat3x3 m)
+__inline b3Mat3x3 mtTranspose(b3Mat3x3 m)
 {
 	b3Mat3x3 out;
 	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
@@ -122,58 +109,49 @@ b3Mat3x3 mtTranspose(b3Mat3x3 m)
 	return out;
 }
 
-__inline
-b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)
+__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)
 {
 	b3Mat3x3 transB;
-	transB = mtTranspose( b );
+	transB = mtTranspose(b);
 	b3Mat3x3 ans;
 	//	why this doesn't run when 0ing in the for{}
 	a.m_row[0].w = 0.f;
 	a.m_row[1].w = 0.f;
 	a.m_row[2].w = 0.f;
-	for(int i=0; i<3; i++)
+	for (int i = 0; i < 3; i++)
 	{
-//	a.m_row[i].w = 0.f;
-		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);
-		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);
-		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);
+		//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = b3Dot3F4(a.m_row[i], transB.m_row[0]);
+		ans.m_row[i].y = b3Dot3F4(a.m_row[i], transB.m_row[1]);
+		ans.m_row[i].z = b3Dot3F4(a.m_row[i], transB.m_row[2]);
 		ans.m_row[i].w = 0.f;
 	}
 	return ans;
 }
 
-__inline
-b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)
+__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)
 {
 	b3Float4 ans;
-	ans.x = b3Dot3F4( a.m_row[0], b );
-	ans.y = b3Dot3F4( a.m_row[1], b );
-	ans.z = b3Dot3F4( a.m_row[2], b );
+	ans.x = b3Dot3F4(a.m_row[0], b);
+	ans.y = b3Dot3F4(a.m_row[1], b);
+	ans.z = b3Dot3F4(a.m_row[2], b);
 	ans.w = 0.f;
 	return ans;
 }
 
-__inline
-b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)
+__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)
 {
 	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
 	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
 	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
 
 	b3Float4 ans;
-	ans.x = b3Dot3F4( a, colx );
-	ans.y = b3Dot3F4( a, coly );
-	ans.z = b3Dot3F4( a, colz );
+	ans.x = b3Dot3F4(a, colx);
+	ans.y = b3Dot3F4(a, coly);
+	ans.z = b3Dot3F4(a, colz);
 	return ans;
 }
 
-
 #endif
 
-
-
-
-
-
-#endif //B3_MAT3x3_H
+#endif  //B3_MAT3x3_H
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h b/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h
index 1c133fb088..b72bee9310 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3PlatformDefinitions.h
@@ -8,18 +8,18 @@ struct MyTest
 
 #ifdef __cplusplus
 //#define b3ConstArray(a) const b3AlignedObjectArray<a>&
-#define b3ConstArray(a) const a*
+#define b3ConstArray(a) const a *
 #define b3AtomicInc(a) ((*a)++)
 
-inline int b3AtomicAdd (volatile int *p, int val)
+inline int b3AtomicAdd(volatile int *p, int val)
 {
 	int oldValue = *p;
-	int newValue = oldValue+val;
+	int newValue = oldValue + val;
 	*p = newValue;
 	return oldValue;
 }
 
-#define __global 
+#define __global
 
 #define B3_STATIC static
 #else
@@ -27,7 +27,7 @@ inline int b3AtomicAdd (volatile int *p, int val)
 #define B3_LARGE_FLOAT 1e18f
 #define B3_INFINITY 1e18f
 #define b3Assert(a)
-#define b3ConstArray(a) __global const a*
+#define b3ConstArray(a) __global const a *
 #define b3AtomicInc atomic_inc
 #define b3AtomicAdd atomic_add
 #define b3Fabs fabs
diff --git a/thirdparty/bullet/Bullet3Common/shared/b3Quat.h b/thirdparty/bullet/Bullet3Common/shared/b3Quat.h
index f262d5e08f..940610c77b 100644
--- a/thirdparty/bullet/Bullet3Common/shared/b3Quat.h
+++ b/thirdparty/bullet/Bullet3Common/shared/b3Quat.h
@@ -5,35 +5,34 @@
 #include "Bullet3Common/shared/b3Float4.h"
 
 #ifdef __cplusplus
-	#include "Bullet3Common/b3Quaternion.h"
-	#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3Quaternion.h"
+#include "Bullet3Common/b3Transform.h"
 
-	#define b3Quat b3Quaternion
-	#define b3QuatConstArg const b3Quaternion&
-	inline b3Quat b3QuatInverse(b3QuatConstArg orn)
-	{
-		return orn.inverse();
-	}
+#define b3Quat b3Quaternion
+#define b3QuatConstArg const b3Quaternion&
+inline b3Quat b3QuatInverse(b3QuatConstArg orn)
+{
+	return orn.inverse();
+}
 
-	inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)
-	{
-		b3Transform tr;
-		tr.setOrigin(translation);
-		tr.setRotation(orientation);
-		return tr(point);
-	}
+inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)
+{
+	b3Transform tr;
+	tr.setOrigin(translation);
+	tr.setRotation(orientation);
+	return tr(point);
+}
 
 #else
-	typedef float4	b3Quat;
-	#define b3QuatConstArg const b3Quat
-	
-	
+typedef float4 b3Quat;
+#define b3QuatConstArg const b3Quat
+
 inline float4 b3FastNormalize4(float4 v)
 {
-	v = (float4)(v.xyz,0.f);
+	v = (float4)(v.xyz, 0.f);
 	return fast_normalize(v);
 }
-	
+
 inline b3Quat b3QuatMul(b3Quat a, b3Quat b);
 inline b3Quat b3QuatNormalized(b3QuatConstArg in);
 inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);
@@ -43,20 +42,20 @@ inline b3Quat b3QuatInverse(b3QuatConstArg q);
 inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)
 {
 	b3Quat ans;
-	ans = b3Cross3( a, b );
-	ans += a.w*b+b.w*a;
-//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
-	ans.w = a.w*b.w - b3Dot3F4(a, b);
+	ans = b3Cross3(a, b);
+	ans += a.w * b + b.w * a;
+	//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w * b.w - b3Dot3F4(a, b);
 	return ans;
 }
 
 inline b3Quat b3QuatNormalized(b3QuatConstArg in)
 {
 	b3Quat q;
-	q=in;
+	q = in;
 	//return b3FastNormalize4(in);
 	float len = native_sqrt(dot(q, q));
-	if(len > 0.f)
+	if (len > 0.f)
 	{
 		q *= 1.f / len;
 	}
@@ -69,15 +68,13 @@ inline b3Quat b3QuatNormalized(b3QuatConstArg in)
 }
 inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)
 {
-	b3Quat qInv = b3QuatInvert( q );
+	b3Quat qInv = b3QuatInvert(q);
 	float4 vcpy = vec;
 	vcpy.w = 0.f;
-	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);
+	float4 out = b3QuatMul(b3QuatMul(q, vcpy), qInv);
 	return out;
 }
 
-
-
 inline b3Quat b3QuatInverse(b3QuatConstArg q)
 {
 	return (b3Quat)(-q.xyz, q.w);
@@ -90,14 +87,14 @@ inline b3Quat b3QuatInvert(b3QuatConstArg q)
 
 inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)
 {
-	return b3QuatRotate( b3QuatInvert( q ), vec );
+	return b3QuatRotate(b3QuatInvert(q), vec);
 }
 
-inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)
+inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)
 {
-	return b3QuatRotate( orientation, point ) + (translation);
+	return b3QuatRotate(orientation, point) + (translation);
 }
-	
-#endif 
 
-#endif //B3_QUAT_H
+#endif
+
+#endif  //B3_QUAT_H