From 22b7c9dfa80d0f7abca40f061865c2ab3c136a74 Mon Sep 17 00:00:00 2001 From: Oussama Date: Thu, 3 Jan 2019 14:26:51 +0100 Subject: Update Bullet to the latest commit 126b676 --- .../LinearMath/TaskScheduler/btTaskScheduler.cpp | 1406 +++++---- .../TaskScheduler/btThreadSupportInterface.h | 76 +- .../TaskScheduler/btThreadSupportPosix.cpp | 458 ++- .../TaskScheduler/btThreadSupportWin32.cpp | 700 +++-- thirdparty/bullet/LinearMath/btAabbUtil2.h | 175 +- .../bullet/LinearMath/btAlignedAllocator.cpp | 266 +- thirdparty/bullet/LinearMath/btAlignedAllocator.h | 100 +- .../bullet/LinearMath/btAlignedObjectArray.h | 653 ++--- thirdparty/bullet/LinearMath/btConvexHull.cpp | 951 +++--- thirdparty/bullet/LinearMath/btConvexHull.h | 180 +- .../bullet/LinearMath/btConvexHullComputer.cpp | 1592 +++++----- .../bullet/LinearMath/btConvexHullComputer.h | 102 +- thirdparty/bullet/LinearMath/btCpuFeatureUtility.h | 38 +- .../bullet/LinearMath/btDefaultMotionState.h | 28 +- thirdparty/bullet/LinearMath/btGeometryUtil.cpp | 113 +- thirdparty/bullet/LinearMath/btGeometryUtil.h | 22 +- .../bullet/LinearMath/btGrahamScan2dConvexHull.h | 91 +- thirdparty/bullet/LinearMath/btHashMap.h | 248 +- thirdparty/bullet/LinearMath/btIDebugDraw.h | 340 ++- thirdparty/bullet/LinearMath/btList.h | 92 +- thirdparty/bullet/LinearMath/btMatrix3x3.h | 1472 +++++----- thirdparty/bullet/LinearMath/btMatrixX.h | 387 ++- thirdparty/bullet/LinearMath/btMinMax.h | 32 +- thirdparty/bullet/LinearMath/btMotionState.h | 26 +- .../bullet/LinearMath/btPolarDecomposition.cpp | 144 +- .../bullet/LinearMath/btPolarDecomposition.h | 29 +- thirdparty/bullet/LinearMath/btPoolAllocator.h | 98 +- thirdparty/bullet/LinearMath/btQuadWord.h | 244 +- thirdparty/bullet/LinearMath/btQuaternion.h | 895 +++--- thirdparty/bullet/LinearMath/btQuickprof.cpp | 617 ++-- thirdparty/bullet/LinearMath/btQuickprof.h | 208 +- thirdparty/bullet/LinearMath/btRandom.h | 13 +- thirdparty/bullet/LinearMath/btScalar.h | 18 +- thirdparty/bullet/LinearMath/btSerializer.cpp | 3 + thirdparty/bullet/LinearMath/btSerializer.h | 1195 ++++---- thirdparty/bullet/LinearMath/btSerializer64.cpp | 2 + thirdparty/bullet/LinearMath/btSpatialAlgebra.h | 248 +- thirdparty/bullet/LinearMath/btStackAlloc.h | 96 +- thirdparty/bullet/LinearMath/btThreads.cpp | 941 +++--- thirdparty/bullet/LinearMath/btThreads.h | 97 +- thirdparty/bullet/LinearMath/btTransform.h | 207 +- thirdparty/bullet/LinearMath/btTransformUtil.h | 158 +- thirdparty/bullet/LinearMath/btVector3.cpp | 3038 ++++++++++---------- thirdparty/bullet/LinearMath/btVector3.h | 1069 ++++--- 44 files changed, 9252 insertions(+), 9616 deletions(-) (limited to 'thirdparty/bullet/LinearMath') diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp index 49510d1660..5f1115c402 100644 --- a/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp +++ b/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp @@ -6,13 +6,11 @@ #include #include - - #if BT_THREADSAFE #include "btThreadSupportInterface.h" -#if defined( _WIN32 ) +#if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -20,404 +18,399 @@ #endif - typedef unsigned long long btU64; static const int kCacheLineSize = 64; void btSpinPause() { -#if defined( _WIN32 ) - YieldProcessor(); +#if defined(_WIN32) + YieldProcessor(); #endif } - struct WorkerThreadStatus { - enum Type - { - kInvalid, - kWaitingForWork, - kWorking, - kSleeping, - }; + enum Type + { + kInvalid, + kWaitingForWork, + 
kWorking, + kSleeping, + }; }; - -ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives +ATTRIBUTE_ALIGNED64(class) +WorkerThreadDirectives { - static const int kMaxThreadCount = BT_MAX_THREAD_COUNT; - // directives for all worker threads packed into a single cacheline - char m_threadDirs[kMaxThreadCount]; + static const int kMaxThreadCount = BT_MAX_THREAD_COUNT; + // directives for all worker threads packed into a single cacheline + char m_threadDirs[kMaxThreadCount]; public: - enum Type - { - kInvalid, - kGoToSleep, // go to sleep - kStayAwakeButIdle, // wait for not checking job queue - kScanForJobs, // actively scan job queue for jobs - }; - WorkerThreadDirectives() - { - for ( int i = 0; i < kMaxThreadCount; ++i ) - { - m_threadDirs[ i ] = 0; - } - } - - Type getDirective(int threadId) - { - btAssert(threadId < kMaxThreadCount); - return static_cast(m_threadDirs[threadId]); - } - - void setDirectiveByRange(int threadBegin, int threadEnd, Type dir) - { - btAssert( threadBegin < threadEnd ); - btAssert( threadEnd <= kMaxThreadCount ); - char dirChar = static_cast(dir); - for ( int i = threadBegin; i < threadEnd; ++i ) - { - m_threadDirs[ i ] = dirChar; - } - } + enum Type + { + kInvalid, + kGoToSleep, // go to sleep + kStayAwakeButIdle, // wait for not checking job queue + kScanForJobs, // actively scan job queue for jobs + }; + WorkerThreadDirectives() + { + for (int i = 0; i < kMaxThreadCount; ++i) + { + m_threadDirs[i] = 0; + } + } + + Type getDirective(int threadId) + { + btAssert(threadId < kMaxThreadCount); + return static_cast(m_threadDirs[threadId]); + } + + void setDirectiveByRange(int threadBegin, int threadEnd, Type dir) + { + btAssert(threadBegin < threadEnd); + btAssert(threadEnd <= kMaxThreadCount); + char dirChar = static_cast(dir); + for (int i = threadBegin; i < threadEnd; ++i) + { + m_threadDirs[i] = dirChar; + } + } }; class JobQueue; -ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage +ATTRIBUTE_ALIGNED64(struct) +ThreadLocalStorage { - int m_threadId; - WorkerThreadStatus::Type m_status; - int m_numJobsFinished; - btSpinMutex m_mutex; - btScalar m_sumResult; - WorkerThreadDirectives * m_directive; - JobQueue* m_queue; - btClock* m_clock; - unsigned int m_cooldownTime; + int m_threadId; + WorkerThreadStatus::Type m_status; + int m_numJobsFinished; + btSpinMutex m_mutex; + btScalar m_sumResult; + WorkerThreadDirectives* m_directive; + JobQueue* m_queue; + btClock* m_clock; + unsigned int m_cooldownTime; }; - struct IJob { - virtual void executeJob(int threadId) = 0; + virtual void executeJob(int threadId) = 0; }; class ParallelForJob : public IJob { - const btIParallelForBody* m_body; - int m_begin; - int m_end; + const btIParallelForBody* m_body; + int m_begin; + int m_end; public: - ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body ) - { - m_body = &body; - m_begin = iBegin; - m_end = iEnd; - } - virtual void executeJob(int threadId) BT_OVERRIDE - { - BT_PROFILE( "executeJob" ); - - // call the functor body to do the work - m_body->forLoop( m_begin, m_end ); - } -}; + ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body) + { + m_body = &body; + m_begin = iBegin; + m_end = iEnd; + } + virtual void executeJob(int threadId) BT_OVERRIDE + { + BT_PROFILE("executeJob"); + // call the functor body to do the work + m_body->forLoop(m_begin, m_end); + } +}; class ParallelSumJob : public IJob { - const btIParallelSumBody* m_body; - ThreadLocalStorage* m_threadLocalStoreArray; - int m_begin; - int m_end; + const btIParallelSumBody* m_body; + 
ThreadLocalStorage* m_threadLocalStoreArray; + int m_begin; + int m_end; public: - ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls ) - { - m_body = &body; - m_threadLocalStoreArray = tls; - m_begin = iBegin; - m_end = iEnd; - } - virtual void executeJob( int threadId ) BT_OVERRIDE - { - BT_PROFILE( "executeJob" ); - - // call the functor body to do the work - btScalar val = m_body->sumLoop( m_begin, m_end ); + ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls) + { + m_body = &body; + m_threadLocalStoreArray = tls; + m_begin = iBegin; + m_end = iEnd; + } + virtual void executeJob(int threadId) BT_OVERRIDE + { + BT_PROFILE("executeJob"); + + // call the functor body to do the work + btScalar val = m_body->sumLoop(m_begin, m_end); #if BT_PARALLEL_SUM_DETERMINISTISM - // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision) - const float TRUNC_SCALE = float(1<<19); - val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE; // truncate some bits + // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision) + const float TRUNC_SCALE = float(1 << 19); + val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE; // truncate some bits #endif - m_threadLocalStoreArray[threadId].m_sumResult += val; - } + m_threadLocalStoreArray[threadId].m_sumResult += val; + } }; - -ATTRIBUTE_ALIGNED64(class) JobQueue +ATTRIBUTE_ALIGNED64(class) +JobQueue { - btThreadSupportInterface* m_threadSupport; - btCriticalSection* m_queueLock; - btSpinMutex m_mutex; - - btAlignedObjectArray m_jobQueue; - char* m_jobMem; - int m_jobMemSize; - bool m_queueIsEmpty; - int m_tailIndex; - int m_headIndex; - int m_allocSize; - bool m_useSpinMutex; - btAlignedObjectArray m_neighborContexts; - char m_cachePadding[kCacheLineSize]; // prevent false sharing - - void freeJobMem() - { - if ( m_jobMem ) - { - // free old - btAlignedFree(m_jobMem); - m_jobMem = NULL; - } - } - void resizeJobMem(int newSize) - { - if (newSize > m_jobMemSize) - { - freeJobMem(); - m_jobMem = static_cast(btAlignedAlloc(newSize, kCacheLineSize)); - m_jobMemSize = newSize; - } - } + btThreadSupportInterface* m_threadSupport; + btCriticalSection* m_queueLock; + btSpinMutex m_mutex; + + btAlignedObjectArray m_jobQueue; + char* m_jobMem; + int m_jobMemSize; + bool m_queueIsEmpty; + int m_tailIndex; + int m_headIndex; + int m_allocSize; + bool m_useSpinMutex; + btAlignedObjectArray m_neighborContexts; + char m_cachePadding[kCacheLineSize]; // prevent false sharing + + void freeJobMem() + { + if (m_jobMem) + { + // free old + btAlignedFree(m_jobMem); + m_jobMem = NULL; + } + } + void resizeJobMem(int newSize) + { + if (newSize > m_jobMemSize) + { + freeJobMem(); + m_jobMem = static_cast(btAlignedAlloc(newSize, kCacheLineSize)); + m_jobMemSize = newSize; + } + } public: - - JobQueue() - { - m_jobMem = NULL; - m_jobMemSize = 0; - m_threadSupport = NULL; - m_queueLock = NULL; - m_headIndex = 0; - m_tailIndex = 0; - m_useSpinMutex = false; - } - ~JobQueue() - { + JobQueue() + { + m_jobMem = NULL; + m_jobMemSize = 0; + m_threadSupport = NULL; + m_queueLock = NULL; + m_headIndex = 0; + m_tailIndex = 0; + m_useSpinMutex = false; + } + ~JobQueue() + { exit(); - } + } void exit() - { + { freeJobMem(); - if (m_queueLock && m_threadSupport) - { - m_threadSupport->deleteCriticalSection(m_queueLock); - m_queueLock = NULL; + if (m_queueLock && m_threadSupport) + { + 
m_threadSupport->deleteCriticalSection(m_queueLock); + m_queueLock = NULL; m_threadSupport = 0; - } - } - - void init(btThreadSupportInterface* threadSup, btAlignedObjectArray* contextArray) - { - m_threadSupport = threadSup; - if (threadSup) - { - m_queueLock = m_threadSupport->createCriticalSection(); - } - setupJobStealing(contextArray, contextArray->size()); - } - void setupJobStealing(btAlignedObjectArray* contextArray, int numActiveContexts) - { - btAlignedObjectArray& contexts = *contextArray; - int selfIndex = 0; - for (int i = 0; i < contexts.size(); ++i) - { - if ( this == &contexts[ i ] ) - { - selfIndex = i; - break; - } - } - int numNeighbors = btMin(2, contexts.size() - 1); - int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3}; - int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]); - m_neighborContexts.reserve( numNeighbors ); - m_neighborContexts.resizeNoInitialize(0); - for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++) - { - int neighborIndex = selfIndex + neighborOffsets[i]; - if ( neighborIndex >= 0 && neighborIndex < numActiveContexts) - { - m_neighborContexts.push_back( &contexts[ neighborIndex ] ); - } - } - } - - bool isQueueEmpty() const {return m_queueIsEmpty;} - void lockQueue() - { - if ( m_useSpinMutex ) - { - m_mutex.lock(); - } - else - { - m_queueLock->lock(); - } - } - void unlockQueue() - { - if ( m_useSpinMutex ) - { - m_mutex.unlock(); - } - else - { - m_queueLock->unlock(); - } - } - void clearQueue(int jobCount, int jobSize) - { - lockQueue(); - m_headIndex = 0; - m_tailIndex = 0; - m_allocSize = 0; - m_queueIsEmpty = true; - int jobBufSize = jobSize * jobCount; - // make sure we have enough memory allocated to store jobs - if ( jobBufSize > m_jobMemSize ) - { - resizeJobMem( jobBufSize ); - } - // make sure job queue is big enough - if ( jobCount > m_jobQueue.capacity() ) - { - m_jobQueue.reserve( jobCount ); - } - unlockQueue(); - m_jobQueue.resizeNoInitialize( 0 ); - } - void* allocJobMem(int jobSize) - { - btAssert(m_jobMemSize >= (m_allocSize + jobSize)); - void* jobMem = &m_jobMem[m_allocSize]; - m_allocSize += jobSize; - return jobMem; - } - void submitJob( IJob* job ) - { - btAssert( reinterpret_cast( job ) >= &m_jobMem[ 0 ] && reinterpret_cast( job ) < &m_jobMem[ 0 ] + m_allocSize ); - m_jobQueue.push_back( job ); - lockQueue(); - m_tailIndex++; - m_queueIsEmpty = false; - unlockQueue(); - } - IJob* consumeJobFromOwnQueue() - { - if ( m_queueIsEmpty ) - { - // lock free path. 
even if this is taken erroneously it isn't harmful - return NULL; - } - IJob* job = NULL; - lockQueue(); - if ( !m_queueIsEmpty ) - { - job = m_jobQueue[ m_headIndex++ ]; - btAssert( reinterpret_cast( job ) >= &m_jobMem[ 0 ] && reinterpret_cast( job ) < &m_jobMem[ 0 ] + m_allocSize ); - if ( m_headIndex == m_tailIndex ) - { - m_queueIsEmpty = true; - } - } - unlockQueue(); - return job; - } - IJob* consumeJob() - { - if (IJob* job = consumeJobFromOwnQueue()) - { - return job; - } - // own queue is empty, try to steal from neighbor - for (int i = 0; i < m_neighborContexts.size(); ++i) - { - JobQueue* otherContext = m_neighborContexts[ i ]; - if ( IJob* job = otherContext->consumeJobFromOwnQueue() ) - { - return job; - } - } - return NULL; - } -}; + } + } + void init(btThreadSupportInterface * threadSup, btAlignedObjectArray * contextArray) + { + m_threadSupport = threadSup; + if (threadSup) + { + m_queueLock = m_threadSupport->createCriticalSection(); + } + setupJobStealing(contextArray, contextArray->size()); + } + void setupJobStealing(btAlignedObjectArray * contextArray, int numActiveContexts) + { + btAlignedObjectArray& contexts = *contextArray; + int selfIndex = 0; + for (int i = 0; i < contexts.size(); ++i) + { + if (this == &contexts[i]) + { + selfIndex = i; + break; + } + } + int numNeighbors = btMin(2, contexts.size() - 1); + int neighborOffsets[] = {-1, 1, -2, 2, -3, 3}; + int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]); + m_neighborContexts.reserve(numNeighbors); + m_neighborContexts.resizeNoInitialize(0); + for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++) + { + int neighborIndex = selfIndex + neighborOffsets[i]; + if (neighborIndex >= 0 && neighborIndex < numActiveContexts) + { + m_neighborContexts.push_back(&contexts[neighborIndex]); + } + } + } + + bool isQueueEmpty() const { return m_queueIsEmpty; } + void lockQueue() + { + if (m_useSpinMutex) + { + m_mutex.lock(); + } + else + { + m_queueLock->lock(); + } + } + void unlockQueue() + { + if (m_useSpinMutex) + { + m_mutex.unlock(); + } + else + { + m_queueLock->unlock(); + } + } + void clearQueue(int jobCount, int jobSize) + { + lockQueue(); + m_headIndex = 0; + m_tailIndex = 0; + m_allocSize = 0; + m_queueIsEmpty = true; + int jobBufSize = jobSize * jobCount; + // make sure we have enough memory allocated to store jobs + if (jobBufSize > m_jobMemSize) + { + resizeJobMem(jobBufSize); + } + // make sure job queue is big enough + if (jobCount > m_jobQueue.capacity()) + { + m_jobQueue.reserve(jobCount); + } + unlockQueue(); + m_jobQueue.resizeNoInitialize(0); + } + void* allocJobMem(int jobSize) + { + btAssert(m_jobMemSize >= (m_allocSize + jobSize)); + void* jobMem = &m_jobMem[m_allocSize]; + m_allocSize += jobSize; + return jobMem; + } + void submitJob(IJob * job) + { + btAssert(reinterpret_cast(job) >= &m_jobMem[0] && reinterpret_cast(job) < &m_jobMem[0] + m_allocSize); + m_jobQueue.push_back(job); + lockQueue(); + m_tailIndex++; + m_queueIsEmpty = false; + unlockQueue(); + } + IJob* consumeJobFromOwnQueue() + { + if (m_queueIsEmpty) + { + // lock free path. 
even if this is taken erroneously it isn't harmful + return NULL; + } + IJob* job = NULL; + lockQueue(); + if (!m_queueIsEmpty) + { + job = m_jobQueue[m_headIndex++]; + btAssert(reinterpret_cast(job) >= &m_jobMem[0] && reinterpret_cast(job) < &m_jobMem[0] + m_allocSize); + if (m_headIndex == m_tailIndex) + { + m_queueIsEmpty = true; + } + } + unlockQueue(); + return job; + } + IJob* consumeJob() + { + if (IJob* job = consumeJobFromOwnQueue()) + { + return job; + } + // own queue is empty, try to steal from neighbor + for (int i = 0; i < m_neighborContexts.size(); ++i) + { + JobQueue* otherContext = m_neighborContexts[i]; + if (IJob* job = otherContext->consumeJobFromOwnQueue()) + { + return job; + } + } + return NULL; + } +}; -static void WorkerThreadFunc( void* userPtr ) +static void WorkerThreadFunc(void* userPtr) { - BT_PROFILE( "WorkerThreadFunc" ); - ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr; - JobQueue* jobQueue = localStorage->m_queue; - - bool shouldSleep = false; - int threadId = localStorage->m_threadId; - while (! shouldSleep) - { - // do work - localStorage->m_mutex.lock(); - while ( IJob* job = jobQueue->consumeJob() ) - { - localStorage->m_status = WorkerThreadStatus::kWorking; - job->executeJob( threadId ); - localStorage->m_numJobsFinished++; - } - localStorage->m_status = WorkerThreadStatus::kWaitingForWork; - localStorage->m_mutex.unlock(); - btU64 clockStart = localStorage->m_clock->getTimeMicroseconds(); - // while queue is empty, - while (jobQueue->isQueueEmpty()) - { - // todo: spin wait a bit to avoid hammering the empty queue - btSpinPause(); - if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep ) - { - shouldSleep = true; - break; - } - // if jobs are incoming, - if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs ) - { - clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock - } - else - { - for ( int i = 0; i < 50; ++i ) - { - btSpinPause(); - btSpinPause(); - btSpinPause(); - btSpinPause(); - if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty()) - { - break; - } - } - // if no jobs incoming and queue has been empty for the cooldown time, sleep - btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart; - if (timeElapsed > localStorage->m_cooldownTime) - { - shouldSleep = true; - break; - } - } - } - } + BT_PROFILE("WorkerThreadFunc"); + ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr; + JobQueue* jobQueue = localStorage->m_queue; + + bool shouldSleep = false; + int threadId = localStorage->m_threadId; + while (!shouldSleep) + { + // do work + localStorage->m_mutex.lock(); + while (IJob* job = jobQueue->consumeJob()) + { + localStorage->m_status = WorkerThreadStatus::kWorking; + job->executeJob(threadId); + localStorage->m_numJobsFinished++; + } + localStorage->m_status = WorkerThreadStatus::kWaitingForWork; + localStorage->m_mutex.unlock(); + btU64 clockStart = localStorage->m_clock->getTimeMicroseconds(); + // while queue is empty, + while (jobQueue->isQueueEmpty()) + { + // todo: spin wait a bit to avoid hammering the empty queue + btSpinPause(); + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep) + { + shouldSleep = true; + break; + } + // if jobs are incoming, + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs) + { + clockStart = 
localStorage->m_clock->getTimeMicroseconds(); // reset clock + } + else + { + for (int i = 0; i < 50; ++i) + { + btSpinPause(); + btSpinPause(); + btSpinPause(); + btSpinPause(); + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty()) + { + break; + } + } + // if no jobs incoming and queue has been empty for the cooldown time, sleep + btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart; + if (timeElapsed > localStorage->m_cooldownTime) + { + shouldSleep = true; + break; + } + } + } + } { BT_PROFILE("sleep"); // go sleep @@ -427,376 +420,373 @@ static void WorkerThreadFunc( void* userPtr ) } } - class btTaskSchedulerDefault : public btITaskScheduler { - btThreadSupportInterface* m_threadSupport; - WorkerThreadDirectives* m_workerDirective; - btAlignedObjectArray m_jobQueues; - btAlignedObjectArray m_perThreadJobQueues; - btAlignedObjectArray m_threadLocalStorage; - btSpinMutex m_antiNestingLock; // prevent nested parallel-for - btClock m_clock; - int m_numThreads; - int m_numWorkerThreads; - int m_numActiveJobQueues; - int m_maxNumThreads; - int m_numJobs; - static const int kFirstWorkerThreadId = 1; + btThreadSupportInterface* m_threadSupport; + WorkerThreadDirectives* m_workerDirective; + btAlignedObjectArray m_jobQueues; + btAlignedObjectArray m_perThreadJobQueues; + btAlignedObjectArray m_threadLocalStorage; + btSpinMutex m_antiNestingLock; // prevent nested parallel-for + btClock m_clock; + int m_numThreads; + int m_numWorkerThreads; + int m_numActiveJobQueues; + int m_maxNumThreads; + int m_numJobs; + static const int kFirstWorkerThreadId = 1; + public: + btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") + { + m_threadSupport = NULL; + m_workerDirective = NULL; + } - btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") - { - m_threadSupport = NULL; - m_workerDirective = NULL; - } - - virtual ~btTaskSchedulerDefault() - { - waitForWorkersToSleep(); - - for ( int i = 0; i < m_jobQueues.size(); ++i ) - { - m_jobQueues[i].exit(); - } - - if (m_threadSupport) - { - delete m_threadSupport; - m_threadSupport = NULL; - } - if (m_workerDirective) - { - btAlignedFree(m_workerDirective); - m_workerDirective = NULL; - } - } - - void init() - { - btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc ); - m_threadSupport = btThreadSupportInterface::create( constructionInfo ); - m_workerDirective = static_cast(btAlignedAlloc(sizeof(*m_workerDirective), 64)); - - m_numWorkerThreads = m_threadSupport->getNumWorkerThreads(); - m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1; - m_numThreads = m_maxNumThreads; - // ideal to have one job queue for each physical processor (except for the main thread which needs no queue) - int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio(); - int numJobQueues = (numThreadsPerQueue == 1) ? 
(m_maxNumThreads-1) : (m_maxNumThreads / numThreadsPerQueue); - m_jobQueues.resize(numJobQueues); - m_numActiveJobQueues = numJobQueues; - for ( int i = 0; i < m_jobQueues.size(); ++i ) - { - m_jobQueues[i].init( m_threadSupport, &m_jobQueues ); - } - m_perThreadJobQueues.resize(m_numThreads); - for ( int i = 0; i < m_numThreads; i++ ) - { - JobQueue* jq = NULL; - // only worker threads get a job queue - if (i > 0) - { - if (numThreadsPerQueue == 1) - { - // one queue per worker thread - jq = &m_jobQueues[ i - kFirstWorkerThreadId ]; - } - else - { - // 2 threads share each queue - jq = &m_jobQueues[ i / numThreadsPerQueue ]; - } - } - m_perThreadJobQueues[i] = jq; - } - m_threadLocalStorage.resize(m_numThreads); - for ( int i = 0; i < m_numThreads; i++ ) - { - ThreadLocalStorage& storage = m_threadLocalStorage[i]; - storage.m_threadId = i; - storage.m_directive = m_workerDirective; - storage.m_status = WorkerThreadStatus::kSleeping; - storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do - storage.m_clock = &m_clock; - storage.m_queue = m_perThreadJobQueues[i]; - } - setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet - setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() ); - } - - void setWorkerDirectives(WorkerThreadDirectives::Type dir) - { - m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir); - } - - virtual int getMaxNumThreads() const BT_OVERRIDE - { - return m_maxNumThreads; - } - - virtual int getNumThreads() const BT_OVERRIDE - { - return m_numThreads; - } - - virtual void setNumThreads( int numThreads ) BT_OVERRIDE - { - m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 ); - m_numWorkerThreads = m_numThreads - 1; - m_numActiveJobQueues = 0; - // if there is at least 1 worker, - if ( m_numWorkerThreads > 0 ) - { - // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue - JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ]; - int iLastActiveContext = lastActiveContext - &m_jobQueues[0]; - m_numActiveJobQueues = iLastActiveContext + 1; - for ( int i = 0; i < m_jobQueues.size(); ++i ) - { - m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues ); - } - } - m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep); - } - - void waitJobs() - { - BT_PROFILE( "waitJobs" ); - // have the main thread work until the job queues are empty - int numMainThreadJobsFinished = 0; - for ( int i = 0; i < m_numActiveJobQueues; ++i ) - { - while ( IJob* job = m_jobQueues[i].consumeJob() ) - { - job->executeJob( 0 ); - numMainThreadJobsFinished++; - } - } - - // done with jobs for now, tell workers to rest (but not sleep) - setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle ); - - btU64 clockStart = m_clock.getTimeMicroseconds(); - // wait for workers to finish any jobs in progress - while ( true ) - { - int numWorkerJobsFinished = 0; - for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread ) - { - ThreadLocalStorage* storage = &m_threadLocalStorage[iThread]; - storage->m_mutex.lock(); - numWorkerJobsFinished += storage->m_numJobsFinished; - storage->m_mutex.unlock(); - } - if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs) - { - break; - } - btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart; - btAssert(timeElapsed < 1000); - if (timeElapsed > 100000) - { - break; - } - 
btSpinPause(); - } - } - - void wakeWorkers(int numWorkersToWake) - { - BT_PROFILE( "wakeWorkers" ); - btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs ); - int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads); - int numActiveWorkers = 0; - for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker ) - { - // note this count of active workers is not necessarily totally reliable, because a worker thread could be - // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare. - ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ]; - if (storage.m_status != WorkerThreadStatus::kSleeping) - { - numActiveWorkers++; - } - } - for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker ) - { - ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ]; - if (storage.m_status == WorkerThreadStatus::kSleeping) - { - m_threadSupport->runTask( iWorker, &storage ); - numActiveWorkers++; - } - } - } - - void waitForWorkersToSleep() - { - BT_PROFILE( "waitForWorkersToSleep" ); - setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); - m_threadSupport->waitForAllTasks(); - for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ ) - { - ThreadLocalStorage& storage = m_threadLocalStorage[i]; - btAssert( storage.m_status == WorkerThreadStatus::kSleeping ); - } - } - - virtual void sleepWorkerThreadsHint() BT_OVERRIDE - { - BT_PROFILE( "sleepWorkerThreadsHint" ); - // hint the task scheduler that we may not be using these threads for a little while - setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); - } - - void prepareWorkerThreads() - { - for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i ) - { - ThreadLocalStorage& storage = m_threadLocalStorage[i]; - storage.m_mutex.lock(); - storage.m_numJobsFinished = 0; - storage.m_mutex.unlock(); - } - setWorkerDirectives( WorkerThreadDirectives::kScanForJobs ); - } - - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_ThreadSupport" ); - btAssert( iEnd >= iBegin ); - btAssert( grainSize >= 1 ); - int iterationCount = iEnd - iBegin; - if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() ) - { - typedef ParallelForJob JobType; - int jobCount = ( iterationCount + grainSize - 1 ) / grainSize; - m_numJobs = jobCount; - btAssert( jobCount >= 2 ); // need more than one job for multithreading - int jobSize = sizeof( JobType ); - - for (int i = 0; i < m_numActiveJobQueues; ++i) - { - m_jobQueues[i].clearQueue( jobCount, jobSize ); - } - // prepare worker threads for incoming work - prepareWorkerThreads(); - // submit all of the jobs - int iJob = 0; - int iThread = kFirstWorkerThreadId; // first worker thread - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - btAssert( iJob < jobCount ); - int iE = btMin( i + grainSize, iEnd ); - JobQueue* jq = m_perThreadJobQueues[ iThread ]; - btAssert(jq); - btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); - void* jobMem = jq->allocJobMem(jobSize); - JobType* job = new ( jobMem ) ParallelForJob( i, iE, body ); // placement new - jq->submitJob( job ); - iJob++; - iThread++; - if ( iThread >= m_numThreads ) - { - iThread = kFirstWorkerThreadId; // first worker thread - } - } - wakeWorkers( jobCount - 1 ); - - // put the main thread to work on emptying the job queue and then wait 
for all workers to finish - waitJobs(); - m_antiNestingLock.unlock(); - } - else - { - BT_PROFILE( "parallelFor_mainThread" ); - // just run on main thread - body.forLoop( iBegin, iEnd ); - } - } - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelSum_ThreadSupport" ); - btAssert( iEnd >= iBegin ); - btAssert( grainSize >= 1 ); - int iterationCount = iEnd - iBegin; - if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() ) - { - typedef ParallelSumJob JobType; - int jobCount = ( iterationCount + grainSize - 1 ) / grainSize; - m_numJobs = jobCount; - btAssert( jobCount >= 2 ); // need more than one job for multithreading - int jobSize = sizeof( JobType ); - for (int i = 0; i < m_numActiveJobQueues; ++i) - { - m_jobQueues[i].clearQueue( jobCount, jobSize ); - } - - // initialize summation - for ( int iThread = 0; iThread < m_numThreads; ++iThread ) - { - m_threadLocalStorage[iThread].m_sumResult = btScalar(0); - } - - // prepare worker threads for incoming work - prepareWorkerThreads(); - // submit all of the jobs - int iJob = 0; - int iThread = kFirstWorkerThreadId; // first worker thread - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - btAssert( iJob < jobCount ); - int iE = btMin( i + grainSize, iEnd ); - JobQueue* jq = m_perThreadJobQueues[ iThread ]; - btAssert(jq); - btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); - void* jobMem = jq->allocJobMem(jobSize); - JobType* job = new ( jobMem ) ParallelSumJob( i, iE, body, &m_threadLocalStorage[0] ); // placement new - jq->submitJob( job ); - iJob++; - iThread++; - if ( iThread >= m_numThreads ) - { - iThread = kFirstWorkerThreadId; // first worker thread - } - } - wakeWorkers( jobCount - 1 ); - - // put the main thread to work on emptying the job queue and then wait for all workers to finish - waitJobs(); - - // add up all the thread sums - btScalar sum = btScalar(0); - for ( int iThread = 0; iThread < m_numThreads; ++iThread ) - { - sum += m_threadLocalStorage[ iThread ].m_sumResult; - } - m_antiNestingLock.unlock(); - return sum; - } - else - { - BT_PROFILE( "parallelSum_mainThread" ); - // just run on main thread - return body.sumLoop( iBegin, iEnd ); - } - } -}; + virtual ~btTaskSchedulerDefault() + { + waitForWorkersToSleep(); + + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].exit(); + } + + if (m_threadSupport) + { + delete m_threadSupport; + m_threadSupport = NULL; + } + if (m_workerDirective) + { + btAlignedFree(m_workerDirective); + m_workerDirective = NULL; + } + } + void init() + { + btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc); + m_threadSupport = btThreadSupportInterface::create(constructionInfo); + m_workerDirective = static_cast(btAlignedAlloc(sizeof(*m_workerDirective), 64)); + + m_numWorkerThreads = m_threadSupport->getNumWorkerThreads(); + m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1; + m_numThreads = m_maxNumThreads; + // ideal to have one job queue for each physical processor (except for the main thread which needs no queue) + int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio(); + int numJobQueues = (numThreadsPerQueue == 1) ? 
(m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue); + m_jobQueues.resize(numJobQueues); + m_numActiveJobQueues = numJobQueues; + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].init(m_threadSupport, &m_jobQueues); + } + m_perThreadJobQueues.resize(m_numThreads); + for (int i = 0; i < m_numThreads; i++) + { + JobQueue* jq = NULL; + // only worker threads get a job queue + if (i > 0) + { + if (numThreadsPerQueue == 1) + { + // one queue per worker thread + jq = &m_jobQueues[i - kFirstWorkerThreadId]; + } + else + { + // 2 threads share each queue + jq = &m_jobQueues[i / numThreadsPerQueue]; + } + } + m_perThreadJobQueues[i] = jq; + } + m_threadLocalStorage.resize(m_numThreads); + for (int i = 0; i < m_numThreads; i++) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_threadId = i; + storage.m_directive = m_workerDirective; + storage.m_status = WorkerThreadStatus::kSleeping; + storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do + storage.m_clock = &m_clock; + storage.m_queue = m_perThreadJobQueues[i]; + } + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); // no work for them yet + setNumThreads(m_threadSupport->getCacheFriendlyNumThreads()); + } + + void setWorkerDirectives(WorkerThreadDirectives::Type dir) + { + m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir); + } + + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return m_maxNumThreads; + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + + virtual void setNumThreads(int numThreads) BT_OVERRIDE + { + m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1); + m_numWorkerThreads = m_numThreads - 1; + m_numActiveJobQueues = 0; + // if there is at least 1 worker, + if (m_numWorkerThreads > 0) + { + // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue + JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1]; + int iLastActiveContext = lastActiveContext - &m_jobQueues[0]; + m_numActiveJobQueues = iLastActiveContext + 1; + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues); + } + } + m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep); + } + + void waitJobs() + { + BT_PROFILE("waitJobs"); + // have the main thread work until the job queues are empty + int numMainThreadJobsFinished = 0; + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + while (IJob* job = m_jobQueues[i].consumeJob()) + { + job->executeJob(0); + numMainThreadJobsFinished++; + } + } + + // done with jobs for now, tell workers to rest (but not sleep) + setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle); + + btU64 clockStart = m_clock.getTimeMicroseconds(); + // wait for workers to finish any jobs in progress + while (true) + { + int numWorkerJobsFinished = 0; + for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread) + { + ThreadLocalStorage* storage = &m_threadLocalStorage[iThread]; + storage->m_mutex.lock(); + numWorkerJobsFinished += storage->m_numJobsFinished; + storage->m_mutex.unlock(); + } + if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs) + { + break; + } + btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart; + btAssert(timeElapsed < 1000); + if (timeElapsed > 100000) + { + break; + } + btSpinPause(); + } + } + + void wakeWorkers(int 
numWorkersToWake) + { + BT_PROFILE("wakeWorkers"); + btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs); + int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads); + int numActiveWorkers = 0; + for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker) + { + // note this count of active workers is not necessarily totally reliable, because a worker thread could be + // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare. + ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker]; + if (storage.m_status != WorkerThreadStatus::kSleeping) + { + numActiveWorkers++; + } + } + for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker) + { + ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker]; + if (storage.m_status == WorkerThreadStatus::kSleeping) + { + m_threadSupport->runTask(iWorker, &storage); + numActiveWorkers++; + } + } + } + + void waitForWorkersToSleep() + { + BT_PROFILE("waitForWorkersToSleep"); + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); + m_threadSupport->waitForAllTasks(); + for (int i = kFirstWorkerThreadId; i < m_numThreads; i++) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + btAssert(storage.m_status == WorkerThreadStatus::kSleeping); + } + } + + virtual void sleepWorkerThreadsHint() BT_OVERRIDE + { + BT_PROFILE("sleepWorkerThreadsHint"); + // hint the task scheduler that we may not be using these threads for a little while + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); + } + + void prepareWorkerThreads() + { + for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_mutex.lock(); + storage.m_numJobsFinished = 0; + storage.m_mutex.unlock(); + } + setWorkerDirectives(WorkerThreadDirectives::kScanForJobs); + } + + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_ThreadSupport"); + btAssert(iEnd >= iBegin); + btAssert(grainSize >= 1); + int iterationCount = iEnd - iBegin; + if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock()) + { + typedef ParallelForJob JobType; + int jobCount = (iterationCount + grainSize - 1) / grainSize; + m_numJobs = jobCount; + btAssert(jobCount >= 2); // need more than one job for multithreading + int jobSize = sizeof(JobType); + + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + m_jobQueues[i].clearQueue(jobCount, jobSize); + } + // prepare worker threads for incoming work + prepareWorkerThreads(); + // submit all of the jobs + int iJob = 0; + int iThread = kFirstWorkerThreadId; // first worker thread + for (int i = iBegin; i < iEnd; i += grainSize) + { + btAssert(iJob < jobCount); + int iE = btMin(i + grainSize, iEnd); + JobQueue* jq = m_perThreadJobQueues[iThread]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new (jobMem) ParallelForJob(i, iE, body); // placement new + jq->submitJob(job); + iJob++; + iThread++; + if (iThread >= m_numThreads) + { + iThread = kFirstWorkerThreadId; // first worker thread + } + } + wakeWorkers(jobCount - 1); + + // put the main thread to work on emptying the job queue and then wait for all workers to finish + waitJobs(); + m_antiNestingLock.unlock(); + } + else + { + 
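+			// Serial fallback: taken when the whole range fits in one grain,
+			// when there are no worker threads, or when m_antiNestingLock is
+			// already held (i.e. a parallel region is in progress), in which
+			// case the loop body simply runs on the calling thread.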
BT_PROFILE("parallelFor_mainThread"); + // just run on main thread + body.forLoop(iBegin, iEnd); + } + } + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelSum_ThreadSupport"); + btAssert(iEnd >= iBegin); + btAssert(grainSize >= 1); + int iterationCount = iEnd - iBegin; + if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock()) + { + typedef ParallelSumJob JobType; + int jobCount = (iterationCount + grainSize - 1) / grainSize; + m_numJobs = jobCount; + btAssert(jobCount >= 2); // need more than one job for multithreading + int jobSize = sizeof(JobType); + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + m_jobQueues[i].clearQueue(jobCount, jobSize); + } + + // initialize summation + for (int iThread = 0; iThread < m_numThreads; ++iThread) + { + m_threadLocalStorage[iThread].m_sumResult = btScalar(0); + } + + // prepare worker threads for incoming work + prepareWorkerThreads(); + // submit all of the jobs + int iJob = 0; + int iThread = kFirstWorkerThreadId; // first worker thread + for (int i = iBegin; i < iEnd; i += grainSize) + { + btAssert(iJob < jobCount); + int iE = btMin(i + grainSize, iEnd); + JobQueue* jq = m_perThreadJobQueues[iThread]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]); // placement new + jq->submitJob(job); + iJob++; + iThread++; + if (iThread >= m_numThreads) + { + iThread = kFirstWorkerThreadId; // first worker thread + } + } + wakeWorkers(jobCount - 1); + + // put the main thread to work on emptying the job queue and then wait for all workers to finish + waitJobs(); + + // add up all the thread sums + btScalar sum = btScalar(0); + for (int iThread = 0; iThread < m_numThreads; ++iThread) + { + sum += m_threadLocalStorage[iThread].m_sumResult; + } + m_antiNestingLock.unlock(); + return sum; + } + else + { + BT_PROFILE("parallelSum_mainThread"); + // just run on main thread + return body.sumLoop(iBegin, iEnd); + } + } +}; btITaskScheduler* btCreateDefaultTaskScheduler() { - btTaskSchedulerDefault* ts = new btTaskSchedulerDefault(); - ts->init(); - return ts; + btTaskSchedulerDefault* ts = new btTaskSchedulerDefault(); + ts->init(); + return ts; } -#else // #if BT_THREADSAFE +#else // #if BT_THREADSAFE btITaskScheduler* btCreateDefaultTaskScheduler() { - return NULL; + return NULL; } -#endif // #else // #if BT_THREADSAFE +#endif // #else // #if BT_THREADSAFE diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h index a0ad802b1e..1fe49335a1 100644 --- a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h +++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h @@ -16,55 +16,49 @@ subject to the following restrictions: #ifndef BT_THREAD_SUPPORT_INTERFACE_H #define BT_THREAD_SUPPORT_INTERFACE_H - - class btCriticalSection { public: - btCriticalSection() {} - virtual ~btCriticalSection() {} + btCriticalSection() {} + virtual ~btCriticalSection() {} - virtual void lock() = 0; - virtual void unlock() = 0; + virtual void lock() = 0; + virtual void unlock() = 0; }; - class btThreadSupportInterface { public: - - virtual ~btThreadSupportInterface() {} - - virtual int getNumWorkerThreads() const = 0; // number of worker threads (total number of logical processors - 1) - 
virtual int getCacheFriendlyNumThreads() const = 0; // the number of logical processors sharing a single L3 cache - virtual int getLogicalToPhysicalCoreRatio() const = 0; // the number of logical processors per physical processor (usually 1 or 2) - virtual void runTask( int threadIndex, void* userData ) = 0; - virtual void waitForAllTasks() = 0; - - virtual btCriticalSection* createCriticalSection() = 0; - virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0; - - typedef void( *ThreadFunc )( void* userPtr ); - - struct ConstructionInfo - { - ConstructionInfo( const char* uniqueName, - ThreadFunc userThreadFunc, - int threadStackSize = 65535 - ) - :m_uniqueName( uniqueName ), - m_userThreadFunc( userThreadFunc ), - m_threadStackSize( threadStackSize ) - { - } - - const char* m_uniqueName; - ThreadFunc m_userThreadFunc; - int m_threadStackSize; - }; - - static btThreadSupportInterface* create( const ConstructionInfo& info ); + virtual ~btThreadSupportInterface() {} + + virtual int getNumWorkerThreads() const = 0; // number of worker threads (total number of logical processors - 1) + virtual int getCacheFriendlyNumThreads() const = 0; // the number of logical processors sharing a single L3 cache + virtual int getLogicalToPhysicalCoreRatio() const = 0; // the number of logical processors per physical processor (usually 1 or 2) + virtual void runTask(int threadIndex, void* userData) = 0; + virtual void waitForAllTasks() = 0; + + virtual btCriticalSection* createCriticalSection() = 0; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) = 0; + + typedef void (*ThreadFunc)(void* userPtr); + + struct ConstructionInfo + { + ConstructionInfo(const char* uniqueName, + ThreadFunc userThreadFunc, + int threadStackSize = 65535) + : m_uniqueName(uniqueName), + m_userThreadFunc(userThreadFunc), + m_threadStackSize(threadStackSize) + { + } + + const char* m_uniqueName; + ThreadFunc m_userThreadFunc; + int m_threadStackSize; + }; + + static btThreadSupportInterface* create(const ConstructionInfo& info); }; -#endif //BT_THREAD_SUPPORT_INTERFACE_H - +#endif //BT_THREAD_SUPPORT_INTERFACE_H diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp index 50ca060dfe..02f4ed1631 100644 --- a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp +++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp @@ -1,3 +1,4 @@ + /* Bullet Continuous Collision Detection and Physics Library Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com @@ -13,9 +14,7 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - -#if BT_THREADSAFE && !defined( _WIN32 ) - +#if BT_THREADSAFE && !defined(_WIN32) #include "LinearMath/btScalar.h" #include "LinearMath/btAlignedObjectArray.h" @@ -27,14 +26,12 @@ subject to the following restrictions: #include #include - #ifndef _XOPEN_SOURCE -#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html -#endif //_XOPEN_SOURCE +#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html +#endif //_XOPEN_SOURCE #include #include -#include //for sysconf - +#include //for sysconf /// /// getNumHardwareThreads() @@ -48,318 +45,309 @@ subject to the following restrictions: int btGetNumHardwareThreads() { - return btMin(BT_MAX_THREAD_COUNT, std::thread::hardware_concurrency()); + return btMin(BT_MAX_THREAD_COUNT, std::thread::hardware_concurrency()); } #else int btGetNumHardwareThreads() { - return btMin(BT_MAX_THREAD_COUNT, sysconf( _SC_NPROCESSORS_ONLN )); + return btMin(BT_MAX_THREAD_COUNT, sysconf(_SC_NPROCESSORS_ONLN)); } #endif - // btThreadSupportPosix helps to initialize/shutdown libspe2, start/stop SPU tasks and communication class btThreadSupportPosix : public btThreadSupportInterface { public: - struct btThreadStatus - { - int m_taskId; - int m_commandId; - int m_status; - - ThreadFunc m_userThreadFunc; - void* m_userPtr; //for taskDesc etc - - pthread_t thread; - //each tread will wait until this signal to start its work - sem_t* startSemaphore; - - // this is a copy of m_mainSemaphore, - //each tread will signal once it is finished with its work - sem_t* m_mainSemaphore; - unsigned long threadUsed; - }; -private: - typedef unsigned long long UINT64; - - btAlignedObjectArray m_activeThreadStatus; - // m_mainSemaphoresemaphore will signal, if and how many threads are finished with their work - sem_t* m_mainSemaphore; - int m_numThreads; - UINT64 m_startedThreadsMask; - void startThreads( const ConstructionInfo& threadInfo ); - void stopThreads(); - int waitForResponse(); + struct btThreadStatus + { + int m_taskId; + int m_commandId; + int m_status; + + ThreadFunc m_userThreadFunc; + void* m_userPtr; //for taskDesc etc + + pthread_t thread; + //each tread will wait until this signal to start its work + sem_t* startSemaphore; + btCriticalSection* m_cs; + // this is a copy of m_mainSemaphore, + //each tread will signal once it is finished with its work + sem_t* m_mainSemaphore; + unsigned long threadUsed; + }; +private: + typedef unsigned long long UINT64; + + btAlignedObjectArray m_activeThreadStatus; + // m_mainSemaphoresemaphore will signal, if and how many threads are finished with their work + sem_t* m_mainSemaphore; + int m_numThreads; + UINT64 m_startedThreadsMask; + void startThreads(const ConstructionInfo& threadInfo); + void stopThreads(); + int waitForResponse(); + btCriticalSection* m_cs; public: - btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo ); - virtual ~btThreadSupportPosix(); + btThreadSupportPosix(const ConstructionInfo& threadConstructionInfo); + virtual ~btThreadSupportPosix(); - virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } - // TODO: return the number of logical processors sharing the first L3 cache - virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; } - // TODO: detect if CPU has hyperthreading enabled - virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; } + virtual int getNumWorkerThreads() const 
BT_OVERRIDE { return m_numThreads; } + // TODO: return the number of logical processors sharing the first L3 cache + virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; } + // TODO: detect if CPU has hyperthreading enabled + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; } - virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE; - virtual void waitForAllTasks() BT_OVERRIDE; + virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE; + virtual void waitForAllTasks() BT_OVERRIDE; - virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; - virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE; + virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE; }; - -#define checkPThreadFunction(returnValue) \ - if(0 != returnValue) { \ - printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \ - } +#define checkPThreadFunction(returnValue) \ + if (0 != returnValue) \ + { \ + printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \ + } // The number of threads should be equal to the number of available cores // Todo: each worker should be linked to a single core, using SetThreadIdealProcessor. - -btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo ) +btThreadSupportPosix::btThreadSupportPosix(const ConstructionInfo& threadConstructionInfo) { - startThreads( threadConstructionInfo ); + m_cs = createCriticalSection(); + startThreads(threadConstructionInfo); } // cleanup/shutdown Libspe2 btThreadSupportPosix::~btThreadSupportPosix() { - stopThreads(); + stopThreads(); + deleteCriticalSection(m_cs); + m_cs=0; } -#if (defined (__APPLE__)) +#if (defined(__APPLE__)) #define NAMED_SEMAPHORES #endif - -static sem_t* createSem( const char* baseName ) +static sem_t* createSem(const char* baseName) { - static int semCount = 0; + static int semCount = 0; #ifdef NAMED_SEMAPHORES - /// Named semaphore begin - char name[ 32 ]; - snprintf( name, 32, "/%8.s-%4.d-%4.4d", baseName, getpid(), semCount++ ); - sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 ); - - if ( tempSem != reinterpret_cast( SEM_FAILED ) ) - { - // printf("Created \"%s\" Semaphore %p\n", name, tempSem); - } - else - { - //printf("Error creating Semaphore %d\n", errno); - exit( -1 ); - } - /// Named semaphore end + /// Named semaphore begin + char name[32]; + snprintf(name, 32, "/%8.s-%4.d-%4.4d", baseName, getpid(), semCount++); + sem_t* tempSem = sem_open(name, O_CREAT, 0600, 0); + + if (tempSem != reinterpret_cast(SEM_FAILED)) + { + // printf("Created \"%s\" Semaphore %p\n", name, tempSem); + } + else + { + //printf("Error creating Semaphore %d\n", errno); + exit(-1); + } + /// Named semaphore end #else - sem_t* tempSem = new sem_t; - checkPThreadFunction( sem_init( tempSem, 0, 0 ) ); + sem_t* tempSem = new sem_t; + checkPThreadFunction(sem_init(tempSem, 0, 0)); #endif - return tempSem; + return tempSem; } -static void destroySem( sem_t* semaphore ) +static void destroySem(sem_t* semaphore) { #ifdef NAMED_SEMAPHORES - checkPThreadFunction( sem_close( semaphore ) ); + checkPThreadFunction(sem_close(semaphore)); #else - checkPThreadFunction( sem_destroy( semaphore ) ); - delete semaphore; + checkPThreadFunction(sem_destroy(semaphore)); + delete semaphore; #endif } -static void *threadFunction( void *argument ) 
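+// Worker thread entry point: each worker blocks on its startSemaphore, runs
+// the user thread func when a task is posted, signals m_mainSemaphore when
+// done, and exits when it is woken with a NULL m_userPtr.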
+static void* threadFunction(void* argument) { - btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument; - - while ( 1 ) - { - checkPThreadFunction( sem_wait( status->startSemaphore ) ); - void* userPtr = status->m_userPtr; - - if ( userPtr ) - { - btAssert( status->m_status ); - status->m_userThreadFunc( userPtr ); - status->m_status = 2; - checkPThreadFunction( sem_post( status->m_mainSemaphore ) ); - status->threadUsed++; - } - else - { - //exit Thread - status->m_status = 3; - checkPThreadFunction( sem_post( status->m_mainSemaphore ) ); - printf( "Thread with taskId %i exiting\n", status->m_taskId ); - break; - } - } - - printf( "Thread TERMINATED\n" ); - return 0; + btThreadSupportPosix::btThreadStatus* status = (btThreadSupportPosix::btThreadStatus*)argument; + + while (1) + { + checkPThreadFunction(sem_wait(status->startSemaphore)); + void* userPtr = status->m_userPtr; + + if (userPtr) + { + btAssert(status->m_status); + status->m_userThreadFunc(userPtr); + status->m_cs->lock(); + status->m_status = 2; + status->m_cs->unlock(); + checkPThreadFunction(sem_post(status->m_mainSemaphore)); + status->threadUsed++; + } + else + { + //exit Thread + status->m_cs->lock(); + status->m_status = 3; + status->m_cs->unlock(); + checkPThreadFunction(sem_post(status->m_mainSemaphore)); + break; + } + } + + return 0; } ///send messages to SPUs -void btThreadSupportPosix::runTask( int threadIndex, void* userData ) +void btThreadSupportPosix::runTask(int threadIndex, void* userData) { - ///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished - btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ]; - btAssert( threadIndex >= 0 ); - btAssert( threadIndex < m_activeThreadStatus.size() ); - - threadStatus.m_commandId = 1; - threadStatus.m_status = 1; - threadStatus.m_userPtr = userData; - m_startedThreadsMask |= UINT64( 1 ) << threadIndex; - - // fire event to start new task - checkPThreadFunction( sem_post( threadStatus.startSemaphore ) ); + ///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished + btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex]; + btAssert(threadIndex >= 0); + btAssert(threadIndex < m_activeThreadStatus.size()); + threadStatus.m_cs = m_cs; + threadStatus.m_commandId = 1; + threadStatus.m_status = 1; + threadStatus.m_userPtr = userData; + m_startedThreadsMask |= UINT64(1) << threadIndex; + + // fire event to start new task + checkPThreadFunction(sem_post(threadStatus.startSemaphore)); } - ///check for messages from SPUs int btThreadSupportPosix::waitForResponse() { - ///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response - ///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback' - - btAssert( m_activeThreadStatus.size() ); - - // wait for any of the threads to finish - checkPThreadFunction( sem_wait( m_mainSemaphore ) ); - // get at least one thread which has finished - size_t last = -1; - - for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t ) - { - if ( 2 == m_activeThreadStatus[ t ].m_status ) - { - last = t; - break; - } - } - - btThreadStatus& threadStatus = m_activeThreadStatus[ last ]; - - btAssert( threadStatus.m_status > 1 ); - threadStatus.m_status = 0; - - // need to find an active spu - btAssert( last >= 0 ); - m_startedThreadsMask &= ~( UINT64( 1 ) << 
last ); - - return last; + ///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response + ///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback' + + btAssert(m_activeThreadStatus.size()); + + // wait for any of the threads to finish + checkPThreadFunction(sem_wait(m_mainSemaphore)); + // get at least one thread which has finished + size_t last = -1; + + for (size_t t = 0; t < size_t(m_activeThreadStatus.size()); ++t) + { + m_cs->lock(); + bool hasFinished = (2 == m_activeThreadStatus[t].m_status); + m_cs->unlock(); + if (hasFinished) + { + last = t; + break; + } + } + + btThreadStatus& threadStatus = m_activeThreadStatus[last]; + + btAssert(threadStatus.m_status > 1); + threadStatus.m_status = 0; + + // need to find an active spu + btAssert(last >= 0); + m_startedThreadsMask &= ~(UINT64(1) << last); + + return last; } - void btThreadSupportPosix::waitForAllTasks() { - while ( m_startedThreadsMask ) - { - waitForResponse(); - } + while (m_startedThreadsMask) + { + waitForResponse(); + } } - -void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo ) +void btThreadSupportPosix::startThreads(const ConstructionInfo& threadConstructionInfo) { - m_numThreads = btGetNumHardwareThreads() - 1; // main thread exists already - printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads ); - m_activeThreadStatus.resize( m_numThreads ); - m_startedThreadsMask = 0; - - m_mainSemaphore = createSem( "main" ); - //checkPThreadFunction(sem_wait(mainSemaphore)); - - for ( int i = 0; i < m_numThreads; i++ ) - { - printf( "starting thread %d\n", i ); - btThreadStatus& threadStatus = m_activeThreadStatus[ i ]; - threadStatus.startSemaphore = createSem( "threadLocal" ); - checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) ); - - threadStatus.m_userPtr = 0; - threadStatus.m_taskId = i; - threadStatus.m_commandId = 0; - threadStatus.m_status = 0; - threadStatus.m_mainSemaphore = m_mainSemaphore; - threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; - threadStatus.threadUsed = 0; - - printf( "started thread %d \n", i ); - } + m_numThreads = btGetNumHardwareThreads() - 1; // main thread exists already + m_activeThreadStatus.resize(m_numThreads); + m_startedThreadsMask = 0; + + m_mainSemaphore = createSem("main"); + //checkPThreadFunction(sem_wait(mainSemaphore)); + + for (int i = 0; i < m_numThreads; i++) + { + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + threadStatus.startSemaphore = createSem("threadLocal"); + threadStatus.m_userPtr = 0; + threadStatus.m_cs = m_cs; + threadStatus.m_taskId = i; + threadStatus.m_commandId = 0; + threadStatus.m_status = 0; + threadStatus.m_mainSemaphore = m_mainSemaphore; + threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; + threadStatus.threadUsed = 0; + checkPThreadFunction(pthread_create(&threadStatus.thread, NULL, &threadFunction, (void*)&threadStatus)); + + } } ///tell the task scheduler we are done with the SPU tasks void btThreadSupportPosix::stopThreads() { - for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t ) - { - btThreadStatus& threadStatus = m_activeThreadStatus[ t ]; - printf( "%s: Thread %i used: %ld\n", __FUNCTION__, int( t ), threadStatus.threadUsed ); - - threadStatus.m_userPtr = 0; - checkPThreadFunction( sem_post( threadStatus.startSemaphore ) ); - checkPThreadFunction( sem_wait( m_mainSemaphore ) ); - - printf( "destroy 
semaphore\n" ); - destroySem( threadStatus.startSemaphore ); - printf( "semaphore destroyed\n" ); - checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) ); - - } - printf( "destroy main semaphore\n" ); - destroySem( m_mainSemaphore ); - printf( "main semaphore destroyed\n" ); - m_activeThreadStatus.clear(); + for (size_t t = 0; t < size_t(m_activeThreadStatus.size()); ++t) + { + btThreadStatus& threadStatus = m_activeThreadStatus[t]; + + threadStatus.m_userPtr = 0; + checkPThreadFunction(sem_post(threadStatus.startSemaphore)); + checkPThreadFunction(sem_wait(m_mainSemaphore)); + + destroySem(threadStatus.startSemaphore); + checkPThreadFunction(pthread_join(threadStatus.thread, 0)); + } + destroySem(m_mainSemaphore); + m_activeThreadStatus.clear(); } class btCriticalSectionPosix : public btCriticalSection { - pthread_mutex_t m_mutex; + pthread_mutex_t m_mutex; public: - btCriticalSectionPosix() - { - pthread_mutex_init( &m_mutex, NULL ); - } - virtual ~btCriticalSectionPosix() - { - pthread_mutex_destroy( &m_mutex ); - } - - virtual void lock() - { - pthread_mutex_lock( &m_mutex ); - } - virtual void unlock() - { - pthread_mutex_unlock( &m_mutex ); - } + btCriticalSectionPosix() + { + pthread_mutex_init(&m_mutex, NULL); + } + virtual ~btCriticalSectionPosix() + { + pthread_mutex_destroy(&m_mutex); + } + + virtual void lock() + { + pthread_mutex_lock(&m_mutex); + } + virtual void unlock() + { + pthread_mutex_unlock(&m_mutex); + } }; - btCriticalSection* btThreadSupportPosix::createCriticalSection() { - return new btCriticalSectionPosix(); + return new btCriticalSectionPosix(); } -void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs ) +void btThreadSupportPosix::deleteCriticalSection(btCriticalSection* cs) { - delete cs; + delete cs; } - -btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info ) +btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info) { - return new btThreadSupportPosix( info ); + return new btThreadSupportPosix(info); } -#endif // BT_THREADSAFE && !defined( _WIN32 ) - +#endif // BT_THREADSAFE && !defined( _WIN32 ) diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp index 00edac650b..922e449cce 100644 --- a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp +++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp @@ -13,7 +13,7 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ -#if defined( _WIN32 ) && BT_THREADSAFE +#if defined(_WIN32) && BT_THREADSAFE #include "LinearMath/btScalar.h" #include "LinearMath/btMinMax.h" @@ -23,450 +23,430 @@ subject to the following restrictions: #include #include - struct btProcessorInfo { - int numLogicalProcessors; - int numCores; - int numNumaNodes; - int numL1Cache; - int numL2Cache; - int numL3Cache; - int numPhysicalPackages; - static const int maxNumTeamMasks = 32; - int numTeamMasks; - UINT64 processorTeamMasks[ maxNumTeamMasks ]; + int numLogicalProcessors; + int numCores; + int numNumaNodes; + int numL1Cache; + int numL2Cache; + int numL3Cache; + int numPhysicalPackages; + static const int maxNumTeamMasks = 32; + int numTeamMasks; + UINT64 processorTeamMasks[maxNumTeamMasks]; }; -UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId ) +UINT64 getProcessorTeamMask(const btProcessorInfo& procInfo, int procId) { - UINT64 procMask = UINT64( 1 ) << procId; - for ( int i = 0; i < procInfo.numTeamMasks; ++i ) - { - if ( procMask & procInfo.processorTeamMasks[ i ] ) - { - return procInfo.processorTeamMasks[ i ]; - } - } - return 0; + UINT64 procMask = UINT64(1) << procId; + for (int i = 0; i < procInfo.numTeamMasks; ++i) + { + if (procMask & procInfo.processorTeamMasks[i]) + { + return procInfo.processorTeamMasks[i]; + } + } + return 0; } -int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId ) +int getProcessorTeamIndex(const btProcessorInfo& procInfo, int procId) { - UINT64 procMask = UINT64( 1 ) << procId; - for ( int i = 0; i < procInfo.numTeamMasks; ++i ) - { - if ( procMask & procInfo.processorTeamMasks[ i ] ) - { - return i; - } - } - return -1; + UINT64 procMask = UINT64(1) << procId; + for (int i = 0; i < procInfo.numTeamMasks; ++i) + { + if (procMask & procInfo.processorTeamMasks[i]) + { + return i; + } + } + return -1; } -int countSetBits( ULONG64 bits ) +int countSetBits(ULONG64 bits) { - int count = 0; - while ( bits ) - { - if ( bits & 1 ) - { - count++; - } - bits >>= 1; - } - return count; + int count = 0; + while (bits) + { + if (bits & 1) + { + count++; + } + bits >>= 1; + } + return count; } +typedef BOOL(WINAPI* Pfn_GetLogicalProcessorInformation)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); -typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD ); - - -void getProcessorInformation( btProcessorInfo* procInfo ) +void getProcessorInformation(btProcessorInfo* procInfo) { - memset( procInfo, 0, sizeof( *procInfo ) ); - Pfn_GetLogicalProcessorInformation getLogicalProcInfo = - (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" ); - if ( getLogicalProcInfo == NULL ) - { - // no info - return; - } - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL; - DWORD bufSize = 0; - while ( true ) - { - if ( getLogicalProcInfo( buf, &bufSize ) ) - { - break; - } - else - { - if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) - { - if ( buf ) - { - free( buf ); - } - buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize ); - } - } - } - - int len = bufSize / sizeof( *buf ); - for ( int i = 0; i < len; ++i ) - { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i; - switch ( info->Relationship ) - { - case RelationNumaNode: - procInfo->numNumaNodes++; - break; - - case RelationProcessorCore: - procInfo->numCores++; - procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask ); - break; - - case RelationCache: - if ( info->Cache.Level == 1 ) - 
{ - procInfo->numL1Cache++; - } - else if ( info->Cache.Level == 2 ) - { - procInfo->numL2Cache++; - } - else if ( info->Cache.Level == 3 ) - { - procInfo->numL3Cache++; - // processors that share L3 cache are considered to be on the same team - // because they can more easily work together on the same data. - // Large performance penalties will occur if 2 or more threads from different - // teams attempt to frequently read and modify the same cache lines. - // - // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into - // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both - // CCXs are operating on the same data, many cycles will be spent keeping the - // two caches coherent. - if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks ) - { - procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask; - procInfo->numTeamMasks++; - } - } - break; - - case RelationProcessorPackage: - procInfo->numPhysicalPackages++; - break; - } - } - free( buf ); + memset(procInfo, 0, sizeof(*procInfo)); + Pfn_GetLogicalProcessorInformation getLogicalProcInfo = + (Pfn_GetLogicalProcessorInformation)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (getLogicalProcInfo == NULL) + { + // no info + return; + } + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL; + DWORD bufSize = 0; + while (true) + { + if (getLogicalProcInfo(buf, &bufSize)) + { + break; + } + else + { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) + { + if (buf) + { + free(buf); + } + buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(bufSize); + } + } + } + + int len = bufSize / sizeof(*buf); + for (int i = 0; i < len; ++i) + { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i; + switch (info->Relationship) + { + case RelationNumaNode: + procInfo->numNumaNodes++; + break; + + case RelationProcessorCore: + procInfo->numCores++; + procInfo->numLogicalProcessors += countSetBits(info->ProcessorMask); + break; + + case RelationCache: + if (info->Cache.Level == 1) + { + procInfo->numL1Cache++; + } + else if (info->Cache.Level == 2) + { + procInfo->numL2Cache++; + } + else if (info->Cache.Level == 3) + { + procInfo->numL3Cache++; + // processors that share L3 cache are considered to be on the same team + // because they can more easily work together on the same data. + // Large performance penalties will occur if 2 or more threads from different + // teams attempt to frequently read and modify the same cache lines. + // + // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into + // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both + // CCXs are operating on the same data, many cycles will be spent keeping the + // two caches coherent. 
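The comment above is the rationale for the whole team-mask mechanism: each worker is later pinned, via SetThreadAffinityMask, to the set of logical processors that share one L3 cache. A hypothetical stand-alone sketch of how those masks fall out of the same GetLogicalProcessorInformation API this function uses, with error handling elided:

#include <windows.h>
#include <cstdio>
#include <cstdlib>

int main()
{
	DWORD bufSize = 0;
	GetLogicalProcessorInformation(NULL, &bufSize);  // first call just reports the required size
	SYSTEM_LOGICAL_PROCESSOR_INFORMATION* buf =
		(SYSTEM_LOGICAL_PROCESSOR_INFORMATION*)malloc(bufSize);
	if (!GetLogicalProcessorInformation(buf, &bufSize))
		return 1;

	int len = int(bufSize / sizeof(*buf));
	for (int i = 0; i < len; ++i)
	{
		if (buf[i].Relationship == RelationCache && buf[i].Cache.Level == 3)
		{
			// every logical processor whose bit is set here shares this L3;
			// on the Ryzen 7 example above you would see two masks (one per CCX),
			// on a single-socket Intel part typically just one
			printf("L3 team mask: 0x%llx\n", (unsigned long long)buf[i].ProcessorMask);
		}
	}
	free(buf);
	return 0;
}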
+ if (procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks) + { + procInfo->processorTeamMasks[procInfo->numTeamMasks] = info->ProcessorMask; + procInfo->numTeamMasks++; + } + } + break; + + case RelationProcessorPackage: + procInfo->numPhysicalPackages++; + break; + } + } + free(buf); } - - ///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication class btThreadSupportWin32 : public btThreadSupportInterface { public: - struct btThreadStatus - { - int m_taskId; - int m_commandId; - int m_status; + struct btThreadStatus + { + int m_taskId; + int m_commandId; + int m_status; - ThreadFunc m_userThreadFunc; - void* m_userPtr; //for taskDesc etc + ThreadFunc m_userThreadFunc; + void* m_userPtr; //for taskDesc etc - void* m_threadHandle; //this one is calling 'Win32ThreadFunc' + void* m_threadHandle; //this one is calling 'Win32ThreadFunc' - void* m_eventStartHandle; - char m_eventStartHandleName[ 32 ]; + void* m_eventStartHandle; + char m_eventStartHandleName[32]; - void* m_eventCompleteHandle; - char m_eventCompleteHandleName[ 32 ]; - }; + void* m_eventCompleteHandle; + char m_eventCompleteHandleName[32]; + }; private: - btAlignedObjectArray m_activeThreadStatus; - btAlignedObjectArray m_completeHandles; - int m_numThreads; - DWORD_PTR m_startedThreadMask; - btProcessorInfo m_processorInfo; + btAlignedObjectArray m_activeThreadStatus; + btAlignedObjectArray m_completeHandles; + int m_numThreads; + DWORD_PTR m_startedThreadMask; + btProcessorInfo m_processorInfo; - void startThreads( const ConstructionInfo& threadInfo ); - void stopThreads(); - int waitForResponse(); + void startThreads(const ConstructionInfo& threadInfo); + void stopThreads(); + int waitForResponse(); public: + btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo); + virtual ~btThreadSupportWin32(); - btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo ); - virtual ~btThreadSupportWin32(); + virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } + virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); } + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; } - virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } - virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); } - virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; } + virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE; + virtual void waitForAllTasks() BT_OVERRIDE; - virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE; - virtual void waitForAllTasks() BT_OVERRIDE; - - virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; - virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE; + virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE; }; - -btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo ) +btThreadSupportWin32::btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo) { - startThreads( threadConstructionInfo ); + startThreads(threadConstructionInfo); } - btThreadSupportWin32::~btThreadSupportWin32() { - stopThreads(); + stopThreads(); 
} - -DWORD WINAPI win32threadStartFunc( LPVOID lpParam ) +DWORD WINAPI win32threadStartFunc(LPVOID lpParam) { - btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam; - - while ( 1 ) - { - WaitForSingleObject( status->m_eventStartHandle, INFINITE ); - void* userPtr = status->m_userPtr; - - if ( userPtr ) - { - btAssert( status->m_status ); - status->m_userThreadFunc( userPtr ); - status->m_status = 2; - SetEvent( status->m_eventCompleteHandle ); - } - else - { - //exit Thread - status->m_status = 3; - printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle ); - SetEvent( status->m_eventCompleteHandle ); - break; - } - } - printf( "Thread TERMINATED\n" ); - return 0; + btThreadSupportWin32::btThreadStatus* status = (btThreadSupportWin32::btThreadStatus*)lpParam; + + while (1) + { + WaitForSingleObject(status->m_eventStartHandle, INFINITE); + void* userPtr = status->m_userPtr; + + if (userPtr) + { + btAssert(status->m_status); + status->m_userThreadFunc(userPtr); + status->m_status = 2; + SetEvent(status->m_eventCompleteHandle); + } + else + { + //exit Thread + status->m_status = 3; + printf("Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle); + SetEvent(status->m_eventCompleteHandle); + break; + } + } + printf("Thread TERMINATED\n"); + return 0; } - -void btThreadSupportWin32::runTask( int threadIndex, void* userData ) +void btThreadSupportWin32::runTask(int threadIndex, void* userData) { - btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ]; - btAssert( threadIndex >= 0 ); - btAssert( int( threadIndex ) < m_activeThreadStatus.size() ); + btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex]; + btAssert(threadIndex >= 0); + btAssert(int(threadIndex) < m_activeThreadStatus.size()); - threadStatus.m_commandId = 1; - threadStatus.m_status = 1; - threadStatus.m_userPtr = userData; - m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex; + threadStatus.m_commandId = 1; + threadStatus.m_status = 1; + threadStatus.m_userPtr = userData; + m_startedThreadMask |= DWORD_PTR(1) << threadIndex; - ///fire event to start new task - SetEvent( threadStatus.m_eventStartHandle ); + ///fire event to start new task + SetEvent(threadStatus.m_eventStartHandle); } - int btThreadSupportWin32::waitForResponse() { - btAssert( m_activeThreadStatus.size() ); + btAssert(m_activeThreadStatus.size()); - int last = -1; - DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE ); - btAssert( res != WAIT_FAILED ); - last = res - WAIT_OBJECT_0; + int last = -1; + DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE); + btAssert(res != WAIT_FAILED); + last = res - WAIT_OBJECT_0; - btThreadStatus& threadStatus = m_activeThreadStatus[ last ]; - btAssert( threadStatus.m_threadHandle ); - btAssert( threadStatus.m_eventCompleteHandle ); + btThreadStatus& threadStatus = m_activeThreadStatus[last]; + btAssert(threadStatus.m_threadHandle); + btAssert(threadStatus.m_eventCompleteHandle); - //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); - btAssert( threadStatus.m_status > 1 ); - threadStatus.m_status = 0; + //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + btAssert(threadStatus.m_status > 1); + threadStatus.m_status = 0; - ///need to find an active spu - btAssert( last >= 0 ); - m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last ); + ///need to find an 
active spu + btAssert(last >= 0); + m_startedThreadMask &= ~(DWORD_PTR(1) << last); - return last; + return last; } - void btThreadSupportWin32::waitForAllTasks() { - while ( m_startedThreadMask ) - { - waitForResponse(); - } + while (m_startedThreadMask) + { + waitForResponse(); + } } - -void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo ) +void btThreadSupportWin32::startThreads(const ConstructionInfo& threadConstructionInfo) { - static int uniqueId = 0; - uniqueId++; - btProcessorInfo& procInfo = m_processorInfo; - getProcessorInformation( &procInfo ); - DWORD_PTR dwProcessAffinityMask = 0; - DWORD_PTR dwSystemAffinityMask = 0; - if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) ) - { - dwProcessAffinityMask = 0; - } - ///The number of threads should be equal to the number of available cores - 1 - m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists) - - m_activeThreadStatus.resize( m_numThreads ); - m_completeHandles.resize( m_numThreads ); - m_startedThreadMask = 0; - - // set main thread affinity - if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 )) - { - SetThreadAffinityMask( GetCurrentThread(), mask ); - SetThreadIdealProcessor( GetCurrentThread(), 0 ); - } - - for ( int i = 0; i < m_numThreads; i++ ) - { - printf( "starting thread %d\n", i ); - - btThreadStatus& threadStatus = m_activeThreadStatus[ i ]; - - LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL; - SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize; - LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc; - LPVOID lpParameter = &threadStatus; - DWORD dwCreationFlags = 0; - LPDWORD lpThreadId = 0; - - threadStatus.m_userPtr = 0; - - sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i ); - threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName ); - - sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i ); - threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName ); - - m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle; - - HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId ); - //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST ); - // highest priority -- can cause erratic performance when numThreads > numCores - // we don't want worker threads to be higher priority than the main thread or the main thread could get - // totally shut out and unable to tell the workers to stop - //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL ); - - { - int processorId = i + 1; // leave processor 0 for main thread - DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId ); - if ( teamMask ) - { - // bind each thread to only execute on processors of it's assigned team - // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team) - // - for multi-socket Intel this will keep threads from migrating from one socket to another - // - for AMD Ryzen this will keep threads from migrating from one CCX to another - DWORD_PTR mask = teamMask & dwProcessAffinityMask; - if ( mask ) - { - SetThreadAffinityMask( handle, mask ); - } - } - SetThreadIdealProcessor( 
handle, processorId ); - } - - threadStatus.m_taskId = i; - threadStatus.m_commandId = 0; - threadStatus.m_status = 0; - threadStatus.m_threadHandle = handle; - threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; - - printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle ); - } + static int uniqueId = 0; + uniqueId++; + btProcessorInfo& procInfo = m_processorInfo; + getProcessorInformation(&procInfo); + DWORD_PTR dwProcessAffinityMask = 0; + DWORD_PTR dwSystemAffinityMask = 0; + if (!GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask)) + { + dwProcessAffinityMask = 0; + } + ///The number of threads should be equal to the number of available cores - 1 + m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists) + + m_activeThreadStatus.resize(m_numThreads); + m_completeHandles.resize(m_numThreads); + m_startedThreadMask = 0; + + // set main thread affinity + if (DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask(procInfo, 0)) + { + SetThreadAffinityMask(GetCurrentThread(), mask); + SetThreadIdealProcessor(GetCurrentThread(), 0); + } + + for (int i = 0; i < m_numThreads; i++) + { + printf("starting thread %d\n", i); + + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + + LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL; + SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize; + LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc; + LPVOID lpParameter = &threadStatus; + DWORD dwCreationFlags = 0; + LPDWORD lpThreadId = 0; + + threadStatus.m_userPtr = 0; + + sprintf(threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i); + threadStatus.m_eventStartHandle = CreateEventA(0, false, false, threadStatus.m_eventStartHandleName); + + sprintf(threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i); + threadStatus.m_eventCompleteHandle = CreateEventA(0, false, false, threadStatus.m_eventCompleteHandleName); + + m_completeHandles[i] = threadStatus.m_eventCompleteHandle; + + HANDLE handle = CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId); + //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST ); + // highest priority -- can cause erratic performance when numThreads > numCores + // we don't want worker threads to be higher priority than the main thread or the main thread could get + // totally shut out and unable to tell the workers to stop + //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL ); + + { + int processorId = i + 1; // leave processor 0 for main thread + DWORD_PTR teamMask = getProcessorTeamMask(procInfo, processorId); + if (teamMask) + { + // bind each thread to only execute on processors of it's assigned team + // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team) + // - for multi-socket Intel this will keep threads from migrating from one socket to another + // - for AMD Ryzen this will keep threads from migrating from one CCX to another + DWORD_PTR mask = teamMask & dwProcessAffinityMask; + if (mask) + { + SetThreadAffinityMask(handle, mask); + } + } + SetThreadIdealProcessor(handle, processorId); + } + + threadStatus.m_taskId = i; + threadStatus.m_commandId = 0; + threadStatus.m_status = 0; + threadStatus.m_threadHandle = handle; + 
threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; + + printf("started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle); + } } ///tell the task scheduler we are done with the SPU tasks void btThreadSupportWin32::stopThreads() { - for ( int i = 0; i < m_activeThreadStatus.size(); i++ ) - { - btThreadStatus& threadStatus = m_activeThreadStatus[ i ]; - if ( threadStatus.m_status > 0 ) - { - WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE ); - } - - threadStatus.m_userPtr = NULL; - SetEvent( threadStatus.m_eventStartHandle ); - WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE ); - - CloseHandle( threadStatus.m_eventCompleteHandle ); - CloseHandle( threadStatus.m_eventStartHandle ); - CloseHandle( threadStatus.m_threadHandle ); - - } - - m_activeThreadStatus.clear(); - m_completeHandles.clear(); + for (int i = 0; i < m_activeThreadStatus.size(); i++) + { + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + if (threadStatus.m_status > 0) + { + WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + } + + threadStatus.m_userPtr = NULL; + SetEvent(threadStatus.m_eventStartHandle); + WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + + CloseHandle(threadStatus.m_eventCompleteHandle); + CloseHandle(threadStatus.m_eventStartHandle); + CloseHandle(threadStatus.m_threadHandle); + } + + m_activeThreadStatus.clear(); + m_completeHandles.clear(); } - class btWin32CriticalSection : public btCriticalSection { private: - CRITICAL_SECTION mCriticalSection; + CRITICAL_SECTION mCriticalSection; public: - btWin32CriticalSection() - { - InitializeCriticalSection( &mCriticalSection ); - } - - ~btWin32CriticalSection() - { - DeleteCriticalSection( &mCriticalSection ); - } - - void lock() - { - EnterCriticalSection( &mCriticalSection ); - } - - void unlock() - { - LeaveCriticalSection( &mCriticalSection ); - } + btWin32CriticalSection() + { + InitializeCriticalSection(&mCriticalSection); + } + + ~btWin32CriticalSection() + { + DeleteCriticalSection(&mCriticalSection); + } + + void lock() + { + EnterCriticalSection(&mCriticalSection); + } + + void unlock() + { + LeaveCriticalSection(&mCriticalSection); + } }; - btCriticalSection* btThreadSupportWin32::createCriticalSection() { - unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 ); - btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection(); - return cs; + unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32CriticalSection), 16); + btWin32CriticalSection* cs = new (mem) btWin32CriticalSection(); + return cs; } -void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection ) +void btThreadSupportWin32::deleteCriticalSection(btCriticalSection* criticalSection) { - criticalSection->~btCriticalSection(); - btAlignedFree( criticalSection ); + criticalSection->~btCriticalSection(); + btAlignedFree(criticalSection); } - -btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info ) +btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info) { - return new btThreadSupportWin32( info ); + return new btThreadSupportWin32(info); } - - -#endif //defined(_WIN32) && BT_THREADSAFE - +#endif //defined(_WIN32) && BT_THREADSAFE diff --git a/thirdparty/bullet/LinearMath/btAabbUtil2.h b/thirdparty/bullet/LinearMath/btAabbUtil2.h index d2997b4e65..eea49dd33f 100644 --- a/thirdparty/bullet/LinearMath/btAabbUtil2.h +++ 
b/thirdparty/bullet/LinearMath/btAabbUtil2.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_AABB_UTIL2 #define BT_AABB_UTIL2 @@ -21,20 +19,18 @@ subject to the following restrictions: #include "btVector3.h" #include "btMinMax.h" - - -SIMD_FORCE_INLINE void AabbExpand (btVector3& aabbMin, - btVector3& aabbMax, - const btVector3& expansionMin, - const btVector3& expansionMax) +SIMD_FORCE_INLINE void AabbExpand(btVector3& aabbMin, + btVector3& aabbMax, + const btVector3& expansionMin, + const btVector3& expansionMax) { aabbMin = aabbMin + expansionMin; aabbMax = aabbMax + expansionMax; } /// conservative test for overlap between two aabbs -SIMD_FORCE_INLINE bool TestPointAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1, - const btVector3 &point) +SIMD_FORCE_INLINE bool TestPointAgainstAabb2(const btVector3& aabbMin1, const btVector3& aabbMax1, + const btVector3& point) { bool overlap = true; overlap = (aabbMin1.getX() > point.getX() || aabbMax1.getX() < point.getX()) ? false : overlap; @@ -43,10 +39,9 @@ SIMD_FORCE_INLINE bool TestPointAgainstAabb2(const btVector3 &aabbMin1, const bt return overlap; } - /// conservative test for overlap between two aabbs -SIMD_FORCE_INLINE bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1, - const btVector3 &aabbMin2, const btVector3 &aabbMax2) +SIMD_FORCE_INLINE bool TestAabbAgainstAabb2(const btVector3& aabbMin1, const btVector3& aabbMax1, + const btVector3& aabbMin2, const btVector3& aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; @@ -56,37 +51,34 @@ SIMD_FORCE_INLINE bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btV } /// conservative test for overlap between triangle and aabb -SIMD_FORCE_INLINE bool TestTriangleAgainstAabb2(const btVector3 *vertices, - const btVector3 &aabbMin, const btVector3 &aabbMax) +SIMD_FORCE_INLINE bool TestTriangleAgainstAabb2(const btVector3* vertices, + const btVector3& aabbMin, const btVector3& aabbMax) { - const btVector3 &p1 = vertices[0]; - const btVector3 &p2 = vertices[1]; - const btVector3 &p3 = vertices[2]; + const btVector3& p1 = vertices[0]; + const btVector3& p2 = vertices[1]; + const btVector3& p3 = vertices[2]; if (btMin(btMin(p1[0], p2[0]), p3[0]) > aabbMax[0]) return false; if (btMax(btMax(p1[0], p2[0]), p3[0]) < aabbMin[0]) return false; if (btMin(btMin(p1[2], p2[2]), p3[2]) > aabbMax[2]) return false; if (btMax(btMax(p1[2], p2[2]), p3[2]) < aabbMin[2]) return false; - + if (btMin(btMin(p1[1], p2[1]), p3[1]) > aabbMax[1]) return false; if (btMax(btMax(p1[1], p2[1]), p3[1]) < aabbMin[1]) return false; return true; } - -SIMD_FORCE_INLINE int btOutcode(const btVector3& p,const btVector3& halfExtent) +SIMD_FORCE_INLINE int btOutcode(const btVector3& p, const btVector3& halfExtent) { - return (p.getX() < -halfExtent.getX() ? 0x01 : 0x0) | - (p.getX() > halfExtent.getX() ? 0x08 : 0x0) | - (p.getY() < -halfExtent.getY() ? 0x02 : 0x0) | - (p.getY() > halfExtent.getY() ? 0x10 : 0x0) | - (p.getZ() < -halfExtent.getZ() ? 0x4 : 0x0) | - (p.getZ() > halfExtent.getZ() ? 0x20 : 0x0); + return (p.getX() < -halfExtent.getX() ? 0x01 : 0x0) | + (p.getX() > halfExtent.getX() ? 0x08 : 0x0) | + (p.getY() < -halfExtent.getY() ? 0x02 : 0x0) | + (p.getY() > halfExtent.getY() ? 0x10 : 0x0) | + (p.getZ() < -halfExtent.getZ() ? 0x4 : 0x0) | + (p.getZ() > halfExtent.getZ() ? 
0x20 : 0x0); } - - SIMD_FORCE_INLINE bool btRayAabb2(const btVector3& rayFrom, const btVector3& rayInvDirection, const unsigned int raySign[3], @@ -97,11 +89,11 @@ SIMD_FORCE_INLINE bool btRayAabb2(const btVector3& rayFrom, { btScalar tmax, tymin, tymax, tzmin, tzmax; tmin = (bounds[raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX(); - tmax = (bounds[1-raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX(); + tmax = (bounds[1 - raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX(); tymin = (bounds[raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY(); - tymax = (bounds[1-raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY(); + tymax = (bounds[1 - raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY(); - if ( (tmin > tymax) || (tymin > tmax) ) + if ((tmin > tymax) || (tymin > tmax)) return false; if (tymin > tmin) @@ -111,59 +103,59 @@ SIMD_FORCE_INLINE bool btRayAabb2(const btVector3& rayFrom, tmax = tymax; tzmin = (bounds[raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ(); - tzmax = (bounds[1-raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ(); + tzmax = (bounds[1 - raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ(); - if ( (tmin > tzmax) || (tzmin > tmax) ) + if ((tmin > tzmax) || (tzmin > tmax)) return false; if (tzmin > tmin) tmin = tzmin; if (tzmax < tmax) tmax = tzmax; - return ( (tmin < lambda_max) && (tmax > lambda_min) ); + return ((tmin < lambda_max) && (tmax > lambda_min)); } -SIMD_FORCE_INLINE bool btRayAabb(const btVector3& rayFrom, - const btVector3& rayTo, - const btVector3& aabbMin, +SIMD_FORCE_INLINE bool btRayAabb(const btVector3& rayFrom, + const btVector3& rayTo, + const btVector3& aabbMin, const btVector3& aabbMax, - btScalar& param, btVector3& normal) + btScalar& param, btVector3& normal) { - btVector3 aabbHalfExtent = (aabbMax-aabbMin)* btScalar(0.5); - btVector3 aabbCenter = (aabbMax+aabbMin)* btScalar(0.5); - btVector3 source = rayFrom - aabbCenter; - btVector3 target = rayTo - aabbCenter; - int sourceOutcode = btOutcode(source,aabbHalfExtent); - int targetOutcode = btOutcode(target,aabbHalfExtent); + btVector3 aabbHalfExtent = (aabbMax - aabbMin) * btScalar(0.5); + btVector3 aabbCenter = (aabbMax + aabbMin) * btScalar(0.5); + btVector3 source = rayFrom - aabbCenter; + btVector3 target = rayTo - aabbCenter; + int sourceOutcode = btOutcode(source, aabbHalfExtent); + int targetOutcode = btOutcode(target, aabbHalfExtent); if ((sourceOutcode & targetOutcode) == 0x0) { btScalar lambda_enter = btScalar(0.0); - btScalar lambda_exit = param; + btScalar lambda_exit = param; btVector3 r = target - source; int i; - btScalar normSign = 1; - btVector3 hitNormal(0,0,0); - int bit=1; + btScalar normSign = 1; + btVector3 hitNormal(0, 0, 0); + int bit = 1; - for (int j=0;j<2;j++) + for (int j = 0; j < 2; j++) { for (i = 0; i != 3; ++i) { if (sourceOutcode & bit) { - btScalar lambda = (-source[i] - aabbHalfExtent[i]*normSign) / r[i]; + btScalar lambda = (-source[i] - aabbHalfExtent[i] * normSign) / r[i]; if (lambda_enter <= lambda) { lambda_enter = lambda; - hitNormal.setValue(0,0,0); + hitNormal.setValue(0, 0, 0); hitNormal[i] = normSign; } } - else if (targetOutcode & bit) + else if (targetOutcode & bit) { - btScalar lambda = (-source[i] - aabbHalfExtent[i]*normSign) / r[i]; + btScalar lambda = (-source[i] - aabbHalfExtent[i] * normSign) / r[i]; btSetMin(lambda_exit, lambda); } - bit<<=1; + bit <<= 1; } normSign = btScalar(-1.); } @@ -177,56 +169,49 @@ SIMD_FORCE_INLINE bool btRayAabb(const btVector3& 
rayFrom, return false; } - - -SIMD_FORCE_INLINE void btTransformAabb(const btVector3& halfExtents, btScalar margin,const btTransform& t,btVector3& aabbMinOut,btVector3& aabbMaxOut) +SIMD_FORCE_INLINE void btTransformAabb(const btVector3& halfExtents, btScalar margin, const btTransform& t, btVector3& aabbMinOut, btVector3& aabbMaxOut) { - btVector3 halfExtentsWithMargin = halfExtents+btVector3(margin,margin,margin); - btMatrix3x3 abs_b = t.getBasis().absolute(); + btVector3 halfExtentsWithMargin = halfExtents + btVector3(margin, margin, margin); + btMatrix3x3 abs_b = t.getBasis().absolute(); btVector3 center = t.getOrigin(); - btVector3 extent = halfExtentsWithMargin.dot3( abs_b[0], abs_b[1], abs_b[2] ); + btVector3 extent = halfExtentsWithMargin.dot3(abs_b[0], abs_b[1], abs_b[2]); aabbMinOut = center - extent; aabbMaxOut = center + extent; } - -SIMD_FORCE_INLINE void btTransformAabb(const btVector3& localAabbMin,const btVector3& localAabbMax, btScalar margin,const btTransform& trans,btVector3& aabbMinOut,btVector3& aabbMaxOut) +SIMD_FORCE_INLINE void btTransformAabb(const btVector3& localAabbMin, const btVector3& localAabbMax, btScalar margin, const btTransform& trans, btVector3& aabbMinOut, btVector3& aabbMaxOut) { - btAssert(localAabbMin.getX() <= localAabbMax.getX()); - btAssert(localAabbMin.getY() <= localAabbMax.getY()); - btAssert(localAabbMin.getZ() <= localAabbMax.getZ()); - btVector3 localHalfExtents = btScalar(0.5)*(localAabbMax-localAabbMin); - localHalfExtents+=btVector3(margin,margin,margin); - - btVector3 localCenter = btScalar(0.5)*(localAabbMax+localAabbMin); - btMatrix3x3 abs_b = trans.getBasis().absolute(); - btVector3 center = trans(localCenter); - btVector3 extent = localHalfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] ); - aabbMinOut = center-extent; - aabbMaxOut = center+extent; + btAssert(localAabbMin.getX() <= localAabbMax.getX()); + btAssert(localAabbMin.getY() <= localAabbMax.getY()); + btAssert(localAabbMin.getZ() <= localAabbMax.getZ()); + btVector3 localHalfExtents = btScalar(0.5) * (localAabbMax - localAabbMin); + localHalfExtents += btVector3(margin, margin, margin); + + btVector3 localCenter = btScalar(0.5) * (localAabbMax + localAabbMin); + btMatrix3x3 abs_b = trans.getBasis().absolute(); + btVector3 center = trans(localCenter); + btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]); + aabbMinOut = center - extent; + aabbMaxOut = center + extent; } #define USE_BANCHLESS 1 #ifdef USE_BANCHLESS - //This block replaces the block below and uses no branches, and replaces the 8 bit return with a 32 bit return for improved performance (~3x on XBox 360) - SIMD_FORCE_INLINE unsigned testQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2) - { - return static_cast(btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0]) - & (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2]) - & (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])), - 1, 0)); - } +//This block replaces the block below and uses no branches, and replaces the 8 bit return with a 32 bit return for improved performance (~3x on XBox 360) +SIMD_FORCE_INLINE unsigned testQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1, const unsigned short int* aabbMax1, const unsigned short int* aabbMin2, const unsigned short int* aabbMax2) +{ + return static_cast(btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= 
aabbMin2[0]) & (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2]) & (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])), + 1, 0)); +} #else - SIMD_FORCE_INLINE bool testQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2) - { - bool overlap = true; - overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap; - overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap; - overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap; - return overlap; - } -#endif //USE_BANCHLESS - -#endif //BT_AABB_UTIL2 - +SIMD_FORCE_INLINE bool testQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1, const unsigned short int* aabbMax1, const unsigned short int* aabbMin2, const unsigned short int* aabbMax2) +{ + bool overlap = true; + overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap; + overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap; + overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap; + return overlap; +} +#endif //USE_BANCHLESS +#endif //BT_AABB_UTIL2 diff --git a/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp b/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp index 0526a42283..39b302b600 100644 --- a/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp +++ b/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp @@ -18,8 +18,8 @@ subject to the following restrictions: #ifdef BT_DEBUG_MEMORY_ALLOCATIONS int gNumAlignedAllocs = 0; int gNumAlignedFree = 0; -int gTotalBytesAlignedAllocs = 0;//detect memory leaks -#endif //BT_DEBUG_MEMORY_ALLOCATIONST_DEBUG_ALLOCATIONS +int gTotalBytesAlignedAllocs = 0; //detect memory leaks +#endif //BT_DEBUG_MEMORY_ALLOCATIONST_DEBUG_ALLOCATIONS static void *btAllocDefault(size_t size) { @@ -34,9 +34,7 @@ static void btFreeDefault(void *ptr) static btAllocFunc *sAllocFunc = btAllocDefault; static btFreeFunc *sFreeFunc = btFreeDefault; - - -#if defined (BT_HAS_ALIGNED_ALLOCATOR) +#if defined(BT_HAS_ALIGNED_ALLOCATOR) #include static void *btAlignedAllocDefault(size_t size, int alignment) { @@ -61,49 +59,48 @@ static inline void btAlignedFreeDefault(void *ptr) } #else - - - - static inline void *btAlignedAllocDefault(size_t size, int alignment) { - void *ret; - char *real; - real = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1)); - if (real) { - ret = btAlignPointer(real + sizeof(void *),alignment); - *((void **)(ret)-1) = (void *)(real); - } else { - ret = (void *)(real); - } - return (ret); + void *ret; + char *real; + real = (char *)sAllocFunc(size + sizeof(void *) + (alignment - 1)); + if (real) + { + ret = btAlignPointer(real + sizeof(void *), alignment); + *((void **)(ret)-1) = (void *)(real); + } + else + { + ret = (void *)(real); + } + return (ret); } static inline void btAlignedFreeDefault(void *ptr) { - void* real; + void *real; - if (ptr) { - real = *((void **)(ptr)-1); - sFreeFunc(real); - } + if (ptr) + { + real = *((void **)(ptr)-1); + sFreeFunc(real); + } } #endif - static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault; static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault; void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc) { - sAlignedAllocFunc = allocFunc ? allocFunc : btAlignedAllocDefault; - sAlignedFreeFunc = freeFunc ? 
freeFunc : btAlignedFreeDefault; + sAlignedAllocFunc = allocFunc ? allocFunc : btAlignedAllocDefault; + sAlignedFreeFunc = freeFunc ? freeFunc : btAlignedFreeDefault; } void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc) { - sAllocFunc = allocFunc ? allocFunc : btAllocDefault; - sFreeFunc = freeFunc ? freeFunc : btFreeDefault; + sAllocFunc = allocFunc ? allocFunc : btAllocDefault; + sFreeFunc = freeFunc ? freeFunc : btFreeDefault; } #ifdef BT_DEBUG_MEMORY_ALLOCATIONS @@ -116,15 +113,15 @@ static int mynumallocs = 0; int btDumpMemoryLeaks() { int totalLeak = 0; - - for (int i=0;i1024*1024) -// { -// printf("big alloc!%d\n", size); -// } - - gTotalBytesAlignedAllocs += size; - gNumAlignedAllocs++; - - -int sz4prt = 4*sizeof(void *); - - real = (char *)sAllocFunc(size + sz4prt + (alignment-1)); - if (real) { - - ret = (void*) btAlignPointer(real + sz4prt, alignment); - btDebugPtrMagic p; - p.vptr = ret; - p.cptr-=sizeof(void*); - *p.vptrptr = (void*)real; - p.cptr-=sizeof(void*); - *p.iptr = size; - p.cptr-=sizeof(void*); - *p.iptr = allocId; - - allocations_id[mynumallocs] = allocId; - allocations_bytes[mynumallocs] = size; - mynumallocs++; - - } else { - ret = (void *)(real);//?? - } - - printf("allocation %d at address %x, from %s,line %d, size %d (total allocated = %d)\n",allocId,real, filename,line,size,gTotalBytesAlignedAllocs); + + void *ret; + char *real; + + // to find some particular memory leak, you could do something like this: + // if (allocId==172) + // { + // printf("catch me!\n"); + // } + // if (size>1024*1024) + // { + // printf("big alloc!%d\n", size); + // } + + gTotalBytesAlignedAllocs += size; + gNumAlignedAllocs++; + + int sz4prt = 4 * sizeof(void *); + + real = (char *)sAllocFunc(size + sz4prt + (alignment - 1)); + if (real) + { + ret = (void *)btAlignPointer(real + sz4prt, alignment); + btDebugPtrMagic p; + p.vptr = ret; + p.cptr -= sizeof(void *); + *p.vptrptr = (void *)real; + p.cptr -= sizeof(void *); + *p.iptr = size; + p.cptr -= sizeof(void *); + *p.iptr = allocId; + + allocations_id[mynumallocs] = allocId; + allocations_bytes[mynumallocs] = size; + mynumallocs++; + } + else + { + ret = (void *)(real); //?? 
+ } + + printf("allocation %d at address %x, from %s,line %d, size %d (total allocated = %d)\n", allocId, real, filename, line, size, gTotalBytesAlignedAllocs); allocId++; - - int* ptr = (int*)ret; - *ptr = 12; - return (ret); + + int *ptr = (int *)ret; + *ptr = 12; + return (ret); } -void btAlignedFreeInternal (void* ptr,int line,char* filename) +void btAlignedFreeInternal(void *ptr, int line, char *filename) { + void *real; - void* real; - - if (ptr) { - gNumAlignedFree++; - - btDebugPtrMagic p; - p.vptr = ptr; - p.cptr-=sizeof(void*); - real = *p.vptrptr; - p.cptr-=sizeof(void*); - int size = *p.iptr; - p.cptr-=sizeof(void*); - int allocId = *p.iptr; - - bool found = false; - - for (int i=0;i -class btAlignedAllocator { - - typedef btAlignedAllocator< T , Alignment > self_type; - -public: +template +class btAlignedAllocator +{ + typedef btAlignedAllocator self_type; +public: //just going down a list: btAlignedAllocator() {} /* btAlignedAllocator( const self_type & ) {} */ - template < typename Other > - btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {} + template + btAlignedAllocator(const btAlignedAllocator&) + { + } - typedef const T* const_pointer; - typedef const T& const_reference; - typedef T* pointer; - typedef T& reference; - typedef T value_type; + typedef const T* const_pointer; + typedef const T& const_reference; + typedef T* pointer; + typedef T& reference; + typedef T value_type; - pointer address ( reference ref ) const { return &ref; } - const_pointer address ( const_reference ref ) const { return &ref; } - pointer allocate ( size_type n , const_pointer * hint = 0 ) { + pointer address(reference ref) const { return &ref; } + const_pointer address(const_reference ref) const { return &ref; } + pointer allocate(size_type n, const_pointer* hint = 0) + { (void)hint; - return reinterpret_cast< pointer >(btAlignedAlloc( sizeof(value_type) * n , Alignment )); + return reinterpret_cast(btAlignedAlloc(sizeof(value_type) * n, Alignment)); } - void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); } - void deallocate( pointer ptr ) { - btAlignedFree( reinterpret_cast< void * >( ptr ) ); + void construct(pointer ptr, const value_type& value) { new (ptr) value_type(value); } + void deallocate(pointer ptr) + { + btAlignedFree(reinterpret_cast(ptr)); } - void destroy ( pointer ptr ) { ptr->~value_type(); } - + void destroy(pointer ptr) { ptr->~value_type(); } - template < typename O > struct rebind { - typedef btAlignedAllocator< O , Alignment > other; + template + struct rebind + { + typedef btAlignedAllocator other; }; - template < typename O > - self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; } + template + self_type& operator=(const btAlignedAllocator&) + { + return *this; + } - friend bool operator==( const self_type & , const self_type & ) { return true; } + friend bool operator==(const self_type&, const self_type&) { return true; } }; - - -#endif //BT_ALIGNED_ALLOCATOR - +#endif //BT_ALIGNED_ALLOCATOR diff --git a/thirdparty/bullet/LinearMath/btAlignedObjectArray.h b/thirdparty/bullet/LinearMath/btAlignedObjectArray.h index f0b646529a..b4671bc19f 100644 --- a/thirdparty/bullet/LinearMath/btAlignedObjectArray.h +++ b/thirdparty/bullet/LinearMath/btAlignedObjectArray.h @@ -13,11 +13,10 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - #ifndef BT_OBJECT_ARRAY__ #define BT_OBJECT_ARRAY__ -#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE +#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE #include "btAlignedAllocator.h" ///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW @@ -28,16 +27,16 @@ subject to the following restrictions: #define BT_USE_PLACEMENT_NEW 1 //#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in or or otherwise... -#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful +#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful #ifdef BT_USE_MEMCPY #include #include -#endif //BT_USE_MEMCPY +#endif //BT_USE_MEMCPY #ifdef BT_USE_PLACEMENT_NEW -#include //for placement new -#endif //BT_USE_PLACEMENT_NEW +#include //for placement new +#endif //BT_USE_PLACEMENT_NEW // The register keyword is deprecated in C++11 so don't use it. #if __cplusplus > 199711L @@ -48,374 +47,358 @@ subject to the following restrictions: ///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods ///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data -template -//template +template +//template class btAlignedObjectArray { - btAlignedAllocator m_allocator; + btAlignedAllocator m_allocator; - int m_size; - int m_capacity; - T* m_data; + int m_size; + int m_capacity; + T* m_data; //PCK: added this line - bool m_ownsMemory; + bool m_ownsMemory; #ifdef BT_ALLOW_ARRAY_COPY_OPERATOR public: - SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray &other) + SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray& other) { copyFromArray(other); return *this; } -#else//BT_ALLOW_ARRAY_COPY_OPERATOR +#else //BT_ALLOW_ARRAY_COPY_OPERATOR private: - SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray &other); -#endif//BT_ALLOW_ARRAY_COPY_OPERATOR + SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray& other); +#endif //BT_ALLOW_ARRAY_COPY_OPERATOR protected: - SIMD_FORCE_INLINE int allocSize(int size) - { - return (size ? size*2 : 1); - } - SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const - { - int i; - for (i=start;i= 0); + btAssert(n < size()); + return m_data[n]; + } - ///Generally it is best to avoid using the copy constructor of an btAlignedObjectArray, and use a (const) reference to the array instead. 
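The warning above, together with BT_ALLOW_ARRAY_COPY_OPERATOR earlier in this header, concerns silent deep copies: both the copy constructor and operator= go through copyFromArray() and duplicate every element. A small sketch of the cost being warned about (processAll and the element count are made up for illustration):

#include "LinearMath/btAlignedObjectArray.h"

static int processAll(const btAlignedObjectArray<int>& a)  // const reference: no copy
{
	int sum = 0;
	for (int i = 0; i < a.size(); i++)
		sum += a[i];
	return sum;
}

int main()
{
	btAlignedObjectArray<int> a;
	a.resize(1000000, 1);
	btAlignedObjectArray<int> b = a;  // copy constructor: allocates and copies 1M ints
	b = a;                            // operator=: a second full deep copy
	return processAll(b) == 1000000 ? 0 : 1;  // the const-reference path is the cheap one
}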
- btAlignedObjectArray(const btAlignedObjectArray& otherArray) - { - init(); + SIMD_FORCE_INLINE T& at(int n) + { + btAssert(n >= 0); + btAssert(n < size()); + return m_data[n]; + } - int otherSize = otherArray.size(); - resize (otherSize); - otherArray.copy(0, otherSize, m_data); - } + SIMD_FORCE_INLINE const T& operator[](int n) const + { + btAssert(n >= 0); + btAssert(n < size()); + return m_data[n]; + } - - - /// return the number of elements in the array - SIMD_FORCE_INLINE int size() const - { - return m_size; - } - - SIMD_FORCE_INLINE const T& at(int n) const - { - btAssert(n>=0); - btAssert(n= 0); + btAssert(n < size()); + return m_data[n]; + } - SIMD_FORCE_INLINE T& at(int n) - { - btAssert(n>=0); - btAssert(n=0); - btAssert(n=0); - btAssert(n 0); + m_size--; + m_data[m_size].~T(); + } - SIMD_FORCE_INLINE void pop_back() + ///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument. + ///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations. + SIMD_FORCE_INLINE void resizeNoInitialize(int newsize) + { + if (newsize > size()) { - btAssert(m_size>0); - m_size--; - m_data[m_size].~T(); + reserve(newsize); } + m_size = newsize; + } + SIMD_FORCE_INLINE void resize(int newsize, const T& fillData = T()) + { + const BT_REGISTER int curSize = size(); - ///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument. - ///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations. 
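A short usage sketch of the resize() contract just described: shrinking runs destructors but keeps the allocation, and resizeNoInitialize() grows without constructing elements. The printed values assume a freshly constructed array:

#include "LinearMath/btAlignedObjectArray.h"
#include <cstdio>

int main()
{
	btAlignedObjectArray<int> a;
	a.resize(8, 42);  // grow: eight copies of the fill value
	printf("%d %d\n", a.size(), a.capacity());  // 8 8
	a.resize(2);  // shrink: size drops, memory is kept
	printf("%d %d\n", a.size(), a.capacity());  // 2 8
	a.resizeNoInitialize(8);  // grow again without constructing elements
	printf("%d %d\n", a.size(), a.capacity());  // 8 8
	return 0;
}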
- SIMD_FORCE_INLINE void resizeNoInitialize(int newsize) + if (newsize < curSize) { - if (newsize > size()) + for (int i = newsize; i < curSize; i++) { - reserve(newsize); + m_data[i].~T(); } - m_size = newsize; } - - SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T()) + else { - const BT_REGISTER int curSize = size(); - - if (newsize < curSize) + if (newsize > curSize) { - for(int i = newsize; i < curSize; i++) - { - m_data[i].~T(); - } - } else - { - if (newsize > curSize) - { - reserve(newsize); - } -#ifdef BT_USE_PLACEMENT_NEW - for (int i=curSize;i - void quickSortInternal(const L& CompareFunc,int lo, int hi) + class less + { + public: + bool operator()(const T& a, const T& b) const { - // lo is the lower index, hi is the upper index - // of the region of array a that is to be sorted - int i=lo, j=hi; - T x=m_data[(lo+hi)/2]; - - // partition - do - { - while (CompareFunc(m_data[i],x)) - i++; - while (CompareFunc(x,m_data[j])) - j--; - if (i<=j) - { - swap(i,j); - i++; j--; - } - } while (i<=j); - - // recursion - if (lo + void quickSortInternal(const L& CompareFunc, int lo, int hi) + { + // lo is the lower index, hi is the upper index + // of the region of array a that is to be sorted + int i = lo, j = hi; + T x = m_data[(lo + hi) / 2]; - template - void quickSort(const L& CompareFunc) + // partition + do { - //don't sort 0 or 1 elements - if (size()>1) + while (CompareFunc(m_data[i], x)) + i++; + while (CompareFunc(x, m_data[j])) + j--; + if (i <= j) { - quickSortInternal(CompareFunc,0,size()-1); + swap(i, j); + i++; + j--; } + } while (i <= j); + + // recursion + if (lo < j) + quickSortInternal(CompareFunc, lo, j); + if (i < hi) + quickSortInternal(CompareFunc, i, hi); + } + + template + void quickSort(const L& CompareFunc) + { + //don't sort 0 or 1 elements + if (size() > 1) + { + quickSortInternal(CompareFunc, 0, size() - 1); } + } + ///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ + template + void downHeap(T* pArr, int k, int n, const L& CompareFunc) + { + /* PRE: a[k+1..N] is a heap */ + /* POST: a[k..N] is a heap */ - ///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ - template - void downHeap(T *pArr, int k, int n, const L& CompareFunc) + T temp = pArr[k - 1]; + /* k has child(s) */ + while (k <= n / 2) { - /* PRE: a[k+1..N] is a heap */ - /* POST: a[k..N] is a heap */ - - T temp = pArr[k - 1]; - /* k has child(s) */ - while (k <= n/2) + int child = 2 * k; + + if ((child < n) && CompareFunc(pArr[child - 1], pArr[child])) + { + child++; + } + /* pick larger child */ + if (CompareFunc(temp, pArr[child - 1])) { - int child = 2*k; - - if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child])) - { - child++; - } - /* pick larger child */ - if (CompareFunc(temp , pArr[child - 1])) - { - /* move child up */ - pArr[k - 1] = pArr[child - 1]; - k = child; - } - else - { - break; - } + /* move child up */ + pArr[k - 1] = pArr[child - 1]; + k = child; } - pArr[k - 1] = temp; - } /*downHeap*/ + else + { + break; + } + } + pArr[k - 1] = temp; + } /*downHeap*/ - void swap(int index0,int index1) - { + void swap(int index0, int index1) + { #ifdef BT_USE_MEMCPY - char temp[sizeof(T)]; - memcpy(temp,&m_data[index0],sizeof(T)); - memcpy(&m_data[index0],&m_data[index1],sizeof(T)); - memcpy(&m_data[index1],temp,sizeof(T)); + char temp[sizeof(T)]; + memcpy(temp, &m_data[index0], sizeof(T)); + memcpy(&m_data[index0], &m_data[index1], sizeof(T)); + memcpy(&m_data[index1], temp, sizeof(T)); #else - T temp = m_data[index0]; - 
m_data[index0] = m_data[index1]; - m_data[index1] = temp; -#endif //BT_USE_PLACEMENT_NEW - - } + T temp = m_data[index0]; + m_data[index0] = m_data[index1]; + m_data[index1] = temp; +#endif //BT_USE_PLACEMENT_NEW + } template void heapSort(const L& CompareFunc) @@ -423,49 +406,66 @@ protected: /* sort a[0..N-1], N.B. 0 to N-1 */ int k; int n = m_size; - for (k = n/2; k > 0; k--) + for (k = n / 2; k > 0; k--) { downHeap(m_data, k, n, CompareFunc); } /* a[1..N] is now a heap */ - while ( n>=1 ) + while (n >= 1) { - swap(0,n-1); /* largest of a[0..n-1] */ - + swap(0, n - 1); /* largest of a[0..n-1] */ n = n - 1; /* restore a[1..i-1] heap */ downHeap(m_data, 1, n, CompareFunc); - } + } } ///non-recursive binary search, assumes sorted array - int findBinarySearch(const T& key) const + int findBinarySearch(const T& key) const { int first = 0; - int last = size()-1; + int last = size() - 1; //assume sorted array - while (first <= last) { + while (first <= last) + { int mid = (first + last) / 2; // compute mid point. - if (key > m_data[mid]) + if (key > m_data[mid]) first = mid + 1; // repeat search in top half. - else if (key < m_data[mid]) - last = mid - 1; // repeat search in bottom half. + else if (key < m_data[mid]) + last = mid - 1; // repeat search in bottom half. else - return mid; // found it. return position ///// + return mid; // found it. return position ///// } - return size(); // failed to find key + return size(); // failed to find key } + int findLinearSearch(const T& key) const + { + int index = size(); + int i; - int findLinearSearch(const T& key) const + for (i = 0; i < size(); i++) + { + if (m_data[i] == key) + { + index = i; + break; + } + } + return index; + } + + // If the key is not in the array, return -1 instead of 0, + // since 0 also means the first element in the array. + int findLinearSearch2(const T& key) const { - int index=size(); + int index = -1; int i; - for (i=0;i btScalar(0.000001)); - + quotient = btScalar(-1.) 
/ quotient; n2n3 *= p0.dist; n3n1 *= p1.dist; @@ -74,105 +75,96 @@ btVector3 ThreePlaneIntersection(const btPlane &p0,const btPlane &p1, const btP potentialVertex += n1n2; potentialVertex *= quotient; - btVector3 result(potentialVertex.getX(),potentialVertex.getY(),potentialVertex.getZ()); + btVector3 result(potentialVertex.getX(), potentialVertex.getY(), potentialVertex.getZ()); return result; - } -btScalar DistanceBetweenLines(const btVector3 &ustart, const btVector3 &udir, const btVector3 &vstart, const btVector3 &vdir, btVector3 *upoint=NULL, btVector3 *vpoint=NULL); -btVector3 TriNormal(const btVector3 &v0, const btVector3 &v1, const btVector3 &v2); -btVector3 NormalOf(const btVector3 *vert, const int n); - +btScalar DistanceBetweenLines(const btVector3 &ustart, const btVector3 &udir, const btVector3 &vstart, const btVector3 &vdir, btVector3 *upoint = NULL, btVector3 *vpoint = NULL); +btVector3 TriNormal(const btVector3 &v0, const btVector3 &v1, const btVector3 &v2); +btVector3 NormalOf(const btVector3 *vert, const int n); btVector3 PlaneLineIntersection(const btPlane &plane, const btVector3 &p0, const btVector3 &p1) { // returns the point where the line p0-p1 intersects the plane n&d - btVector3 dif; - dif = p1-p0; - btScalar dn= btDot(plane.normal,dif); - btScalar t = -(plane.dist+btDot(plane.normal,p0) )/dn; - return p0 + (dif*t); + btVector3 dif; + dif = p1 - p0; + btScalar dn = btDot(plane.normal, dif); + btScalar t = -(plane.dist + btDot(plane.normal, p0)) / dn; + return p0 + (dif * t); } btVector3 PlaneProject(const btPlane &plane, const btVector3 &point) { - return point - plane.normal * (btDot(point,plane.normal)+plane.dist); + return point - plane.normal * (btDot(point, plane.normal) + plane.dist); } btVector3 TriNormal(const btVector3 &v0, const btVector3 &v1, const btVector3 &v2) { // return the normal of the triangle // inscribed by v0, v1, and v2 - btVector3 cp=btCross(v1-v0,v2-v1); - btScalar m=cp.length(); - if(m==0) return btVector3(1,0,0); - return cp*(btScalar(1.0)/m); + btVector3 cp = btCross(v1 - v0, v2 - v1); + btScalar m = cp.length(); + if (m == 0) return btVector3(1, 0, 0); + return cp * (btScalar(1.0) / m); } - btScalar DistanceBetweenLines(const btVector3 &ustart, const btVector3 &udir, const btVector3 &vstart, const btVector3 &vdir, btVector3 *upoint, btVector3 *vpoint) { btVector3 cp; - cp = btCross(udir,vdir).normalized(); + cp = btCross(udir, vdir).normalized(); - btScalar distu = -btDot(cp,ustart); - btScalar distv = -btDot(cp,vstart); - btScalar dist = (btScalar)fabs(distu-distv); - if(upoint) - { + btScalar distu = -btDot(cp, ustart); + btScalar distv = -btDot(cp, vstart); + btScalar dist = (btScalar)fabs(distu - distv); + if (upoint) + { btPlane plane; - plane.normal = btCross(vdir,cp).normalized(); - plane.dist = -btDot(plane.normal,vstart); - *upoint = PlaneLineIntersection(plane,ustart,ustart+udir); + plane.normal = btCross(vdir, cp).normalized(); + plane.dist = -btDot(plane.normal, vstart); + *upoint = PlaneLineIntersection(plane, ustart, ustart + udir); } - if(vpoint) - { + if (vpoint) + { btPlane plane; - plane.normal = btCross(udir,cp).normalized(); - plane.dist = -btDot(plane.normal,ustart); - *vpoint = PlaneLineIntersection(plane,vstart,vstart+vdir); + plane.normal = btCross(udir, cp).normalized(); + plane.dist = -btDot(plane.normal, ustart); + *vpoint = PlaneLineIntersection(plane, vstart, vstart + vdir); } return dist; } - - - - - - -#define COPLANAR (0) -#define UNDER (1) -#define OVER (2) -#define SPLIT (OVER|UNDER) +#define COPLANAR (0) 
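// (These flags, continued below, classify geometry against a plane within a
// PAPERWIDTH tolerance: PlaneTest() reports a point as COPLANAR, UNDER, or
// OVER, and SplitTest() ORs the per-vertex results, so a convex piece with
// vertices on both sides yields SPLIT = OVER | UNDER.)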
+#define UNDER (1) +#define OVER (2) +#define SPLIT (OVER | UNDER) #define PAPERWIDTH (btScalar(0.001)) btScalar planetestepsilon = PAPERWIDTH; - - typedef ConvexH::HalfEdge HalfEdge; -ConvexH::ConvexH(int vertices_size,int edges_size,int facets_size) +ConvexH::ConvexH(int vertices_size, int edges_size, int facets_size) { vertices.resize(vertices_size); edges.resize(edges_size); facets.resize(facets_size); } - int PlaneTest(const btPlane &p, const btVector3 &v); -int PlaneTest(const btPlane &p, const btVector3 &v) { - btScalar a = btDot(v,p.normal)+p.dist; - int flag = (a>planetestepsilon)?OVER:((a<-planetestepsilon)?UNDER:COPLANAR); +int PlaneTest(const btPlane &p, const btVector3 &v) +{ + btScalar a = btDot(v, p.normal) + p.dist; + int flag = (a > planetestepsilon) ? OVER : ((a < -planetestepsilon) ? UNDER : COPLANAR); return flag; } -int SplitTest(ConvexH &convex,const btPlane &plane); -int SplitTest(ConvexH &convex,const btPlane &plane) { - int flag=0; - for(int i=0;i -int maxdirfiltered(const T *p,int count,const T &dir,btAlignedObjectArray &allow) +template +int maxdirfiltered(const T *p, int count, const T &dir, btAlignedObjectArray &allow) { btAssert(count); - int m=-1; - for(int i=0;ibtDot(p[m],dir)) - m=i; + if (m == -1 || btDot(p[i], dir) > btDot(p[m], dir)) + m = i; } - btAssert(m!=-1); + btAssert(m != -1); return m; -} +} btVector3 orth(const btVector3 &v); btVector3 orth(const btVector3 &v) { - btVector3 a=btCross(v,btVector3(0,0,1)); - btVector3 b=btCross(v,btVector3(0,1,0)); + btVector3 a = btCross(v, btVector3(0, 0, 1)); + btVector3 b = btCross(v, btVector3(0, 1, 0)); if (a.length() > b.length()) { return a.normalized(); - } else { + } + else + { return b.normalized(); } } - -template -int maxdirsterid(const T *p,int count,const T &dir,btAlignedObjectArray &allow) +template +int maxdirsterid(const T *p, int count, const T &dir, btAlignedObjectArray &allow) { - int m=-1; - while(m==-1) + int m = -1; + while (m == -1) { - m = maxdirfiltered(p,count,dir,allow); - if(allow[m]==3) return m; + m = maxdirfiltered(p, count, dir, allow); + if (allow[m] == 3) return m; T u = orth(dir); - T v = btCross(u,dir); - int ma=-1; - for(btScalar x = btScalar(0.0) ; x<= btScalar(360.0) ; x+= btScalar(45.0)) + T v = btCross(u, dir); + int ma = -1; + for (btScalar x = btScalar(0.0); x <= btScalar(360.0); x += btScalar(45.0)) { - btScalar s = btSin(SIMD_RADS_PER_DEG*(x)); - btScalar c = btCos(SIMD_RADS_PER_DEG*(x)); - int mb = maxdirfiltered(p,count,dir+(u*s+v*c)*btScalar(0.025),allow); - if(ma==m && mb==m) + btScalar s = btSin(SIMD_RADS_PER_DEG * (x)); + btScalar c = btCos(SIMD_RADS_PER_DEG * (x)); + int mb = maxdirfiltered(p, count, dir + (u * s + v * c) * btScalar(0.025), allow); + if (ma == m && mb == m) { - allow[m]=3; + allow[m] = 3; return m; } - if(ma!=-1 && ma!=mb) // Yuck - this is really ugly + if (ma != -1 && ma != mb) // Yuck - this is really ugly { int mc = ma; - for(btScalar xx = x-btScalar(40.0) ; xx <= x ; xx+= btScalar(5.0)) + for (btScalar xx = x - btScalar(40.0); xx <= x; xx += btScalar(5.0)) { - btScalar s = btSin(SIMD_RADS_PER_DEG*(xx)); - btScalar c = btCos(SIMD_RADS_PER_DEG*(xx)); - int md = maxdirfiltered(p,count,dir+(u*s+v*c)*btScalar(0.025),allow); - if(mc==m && md==m) + btScalar s = btSin(SIMD_RADS_PER_DEG * (xx)); + btScalar c = btCos(SIMD_RADS_PER_DEG * (xx)); + int md = maxdirfiltered(p, count, dir + (u * s + v * c) * btScalar(0.025), allow); + if (mc == m && md == m) { - allow[m]=3; + allow[m] = 3; return m; } - mc=md; + mc = md; } } - ma=mb; + ma = mb; } - 
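// (If the perturbed-direction sweep above never settles on m, the candidate
// is ambiguous for this direction: it is disallowed below and the scan
// restarts, which guarantees maxdirsterid() terminates with a stable
// support point.)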
allow[m]=0; - m=-1; + allow[m] = 0; + m = -1; } btAssert(0); return m; -} - - - +} -int operator ==(const int3 &a,const int3 &b); -int operator ==(const int3 &a,const int3 &b) +int operator==(const int3 &a, const int3 &b); +int operator==(const int3 &a, const int3 &b) { - for(int i=0;i<3;i++) + for (int i = 0; i < 3; i++) { - if(a[i]!=b[i]) return 0; + if (a[i] != b[i]) return 0; } return 1; } - -int above(btVector3* vertices,const int3& t, const btVector3 &p, btScalar epsilon); -int above(btVector3* vertices,const int3& t, const btVector3 &p, btScalar epsilon) +int above(btVector3 *vertices, const int3 &t, const btVector3 &p, btScalar epsilon); +int above(btVector3 *vertices, const int3 &t, const btVector3 &p, btScalar epsilon) { - btVector3 n=TriNormal(vertices[t[0]],vertices[t[1]],vertices[t[2]]); - return (btDot(n,p-vertices[t[0]]) > epsilon); // EPSILON??? + btVector3 n = TriNormal(vertices[t[0]], vertices[t[1]], vertices[t[2]]); + return (btDot(n, p - vertices[t[0]]) > epsilon); // EPSILON??? } -int hasedge(const int3 &t, int a,int b); -int hasedge(const int3 &t, int a,int b) +int hasedge(const int3 &t, int a, int b); +int hasedge(const int3 &t, int a, int b) { - for(int i=0;i<3;i++) + for (int i = 0; i < 3; i++) { - int i1= (i+1)%3; - if(t[i]==a && t[i1]==b) return 1; + int i1 = (i + 1) % 3; + if (t[i] == a && t[i1] == b) return 1; } return 0; } int hasvert(const int3 &t, int v); int hasvert(const int3 &t, int v) { - return (t[0]==v || t[1]==v || t[2]==v) ; + return (t[0] == v || t[1] == v || t[2] == v); } -int shareedge(const int3 &a,const int3 &b); -int shareedge(const int3 &a,const int3 &b) +int shareedge(const int3 &a, const int3 &b); +int shareedge(const int3 &a, const int3 &b) { int i; - for(i=0;i<3;i++) + for (i = 0; i < 3; i++) { - int i1= (i+1)%3; - if(hasedge(a,b[i1],b[i])) return 1; + int i1 = (i + 1) % 3; + if (hasedge(a, b[i1], b[i])) return 1; } return 0; } class btHullTriangle; - - class btHullTriangle : public int3 { public: @@ -346,51 +327,50 @@ public: int id; int vmax; btScalar rise; - btHullTriangle(int a,int b,int c):int3(a,b,c),n(-1,-1,-1) + btHullTriangle(int a, int b, int c) : int3(a, b, c), n(-1, -1, -1) { - vmax=-1; + vmax = -1; rise = btScalar(0.0); } ~btHullTriangle() { } - int &neib(int a,int b); + int &neib(int a, int b); }; - -int &btHullTriangle::neib(int a,int b) +int &btHullTriangle::neib(int a, int b) { - static int er=-1; + static int er = -1; int i; - for(i=0;i<3;i++) + for (i = 0; i < 3; i++) { - int i1=(i+1)%3; - int i2=(i+2)%3; - if((*this)[i]==a && (*this)[i1]==b) return n[i2]; - if((*this)[i]==b && (*this)[i1]==a) return n[i2]; + int i1 = (i + 1) % 3; + int i2 = (i + 2) % 3; + if ((*this)[i] == a && (*this)[i1] == b) return n[i2]; + if ((*this)[i] == b && (*this)[i1] == a) return n[i2]; } btAssert(0); return er; } -void HullLibrary::b2bfix(btHullTriangle* s,btHullTriangle*t) +void HullLibrary::b2bfix(btHullTriangle *s, btHullTriangle *t) { int i; - for(i=0;i<3;i++) + for (i = 0; i < 3; i++) { - int i1=(i+1)%3; - int i2=(i+2)%3; + int i1 = (i + 1) % 3; + int i2 = (i + 2) % 3; int a = (*s)[i1]; int b = (*s)[i2]; - btAssert(m_tris[s->neib(a,b)]->neib(b,a) == s->id); - btAssert(m_tris[t->neib(a,b)]->neib(b,a) == t->id); - m_tris[s->neib(a,b)]->neib(b,a) = t->neib(b,a); - m_tris[t->neib(b,a)]->neib(a,b) = s->neib(a,b); + btAssert(m_tris[s->neib(a, b)]->neib(b, a) == s->id); + btAssert(m_tris[t->neib(a, b)]->neib(b, a) == t->id); + m_tris[s->neib(a, b)]->neib(b, a) = t->neib(b, a); + m_tris[t->neib(b, a)]->neib(a, b) = s->neib(a, b); } } -void 
HullLibrary::removeb2b(btHullTriangle* s,btHullTriangle*t) +void HullLibrary::removeb2b(btHullTriangle *s, btHullTriangle *t) { - b2bfix(s,t); + b2bfix(s, t); deAllocateTriangle(s); deAllocateTriangle(t); @@ -401,11 +381,11 @@ void HullLibrary::checkit(btHullTriangle *t) (void)t; int i; - btAssert(m_tris[t->id]==t); - for(i=0;i<3;i++) + btAssert(m_tris[t->id] == t); + for (i = 0; i < 3; i++) { - int i1=(i+1)%3; - int i2=(i+2)%3; + int i1 = (i + 1) % 3; + int i2 = (i + 2) % 3; int a = (*t)[i1]; int b = (*t)[i2]; @@ -415,226 +395,233 @@ void HullLibrary::checkit(btHullTriangle *t) (void)a; (void)b; - btAssert(a!=b); - btAssert( m_tris[t->n[i]]->neib(b,a) == t->id); + btAssert(a != b); + btAssert(m_tris[t->n[i]]->neib(b, a) == t->id); } } -btHullTriangle* HullLibrary::allocateTriangle(int a,int b,int c) +btHullTriangle *HullLibrary::allocateTriangle(int a, int b, int c) { - void* mem = btAlignedAlloc(sizeof(btHullTriangle),16); - btHullTriangle* tr = new (mem)btHullTriangle(a,b,c); + void *mem = btAlignedAlloc(sizeof(btHullTriangle), 16); + btHullTriangle *tr = new (mem) btHullTriangle(a, b, c); tr->id = m_tris.size(); m_tris.push_back(tr); return tr; } -void HullLibrary::deAllocateTriangle(btHullTriangle* tri) +void HullLibrary::deAllocateTriangle(btHullTriangle *tri) { - btAssert(m_tris[tri->id]==tri); - m_tris[tri->id]=NULL; + btAssert(m_tris[tri->id] == tri); + m_tris[tri->id] = NULL; tri->~btHullTriangle(); btAlignedFree(tri); } - -void HullLibrary::extrude(btHullTriangle *t0,int v) +void HullLibrary::extrude(btHullTriangle *t0, int v) { - int3 t= *t0; + int3 t = *t0; int n = m_tris.size(); - btHullTriangle* ta = allocateTriangle(v,t[1],t[2]); - ta->n = int3(t0->n[0],n+1,n+2); - m_tris[t0->n[0]]->neib(t[1],t[2]) = n+0; - btHullTriangle* tb = allocateTriangle(v,t[2],t[0]); - tb->n = int3(t0->n[1],n+2,n+0); - m_tris[t0->n[1]]->neib(t[2],t[0]) = n+1; - btHullTriangle* tc = allocateTriangle(v,t[0],t[1]); - tc->n = int3(t0->n[2],n+0,n+1); - m_tris[t0->n[2]]->neib(t[0],t[1]) = n+2; + btHullTriangle *ta = allocateTriangle(v, t[1], t[2]); + ta->n = int3(t0->n[0], n + 1, n + 2); + m_tris[t0->n[0]]->neib(t[1], t[2]) = n + 0; + btHullTriangle *tb = allocateTriangle(v, t[2], t[0]); + tb->n = int3(t0->n[1], n + 2, n + 0); + m_tris[t0->n[1]]->neib(t[2], t[0]) = n + 1; + btHullTriangle *tc = allocateTriangle(v, t[0], t[1]); + tc->n = int3(t0->n[2], n + 0, n + 1); + m_tris[t0->n[2]]->neib(t[0], t[1]) = n + 2; checkit(ta); checkit(tb); checkit(tc); - if(hasvert(*m_tris[ta->n[0]],v)) removeb2b(ta,m_tris[ta->n[0]]); - if(hasvert(*m_tris[tb->n[0]],v)) removeb2b(tb,m_tris[tb->n[0]]); - if(hasvert(*m_tris[tc->n[0]],v)) removeb2b(tc,m_tris[tc->n[0]]); + if (hasvert(*m_tris[ta->n[0]], v)) removeb2b(ta, m_tris[ta->n[0]]); + if (hasvert(*m_tris[tb->n[0]], v)) removeb2b(tb, m_tris[tb->n[0]]); + if (hasvert(*m_tris[tc->n[0]], v)) removeb2b(tc, m_tris[tc->n[0]]); deAllocateTriangle(t0); - } -btHullTriangle* HullLibrary::extrudable(btScalar epsilon) +btHullTriangle *HullLibrary::extrudable(btScalar epsilon) { int i; - btHullTriangle *t=NULL; - for(i=0;iriserise)) + if (!t || (m_tris[i] && t->rise < m_tris[i]->rise)) { t = m_tris[i]; } } - return (t->rise >epsilon)?t:NULL ; + return (t->rise > epsilon) ? 
t : NULL; } - - - -int4 HullLibrary::FindSimplex(btVector3 *verts,int verts_count,btAlignedObjectArray &allow) +int4 HullLibrary::FindSimplex(btVector3 *verts, int verts_count, btAlignedObjectArray &allow) { btVector3 basis[3]; - basis[0] = btVector3( btScalar(0.01), btScalar(0.02), btScalar(1.0) ); - int p0 = maxdirsterid(verts,verts_count, basis[0],allow); - int p1 = maxdirsterid(verts,verts_count,-basis[0],allow); - basis[0] = verts[p0]-verts[p1]; - if(p0==p1 || basis[0]==btVector3(0,0,0)) - return int4(-1,-1,-1,-1); - basis[1] = btCross(btVector3( btScalar(1),btScalar(0.02), btScalar(0)),basis[0]); - basis[2] = btCross(btVector3(btScalar(-0.02), btScalar(1), btScalar(0)),basis[0]); + basis[0] = btVector3(btScalar(0.01), btScalar(0.02), btScalar(1.0)); + int p0 = maxdirsterid(verts, verts_count, basis[0], allow); + int p1 = maxdirsterid(verts, verts_count, -basis[0], allow); + basis[0] = verts[p0] - verts[p1]; + if (p0 == p1 || basis[0] == btVector3(0, 0, 0)) + return int4(-1, -1, -1, -1); + basis[1] = btCross(btVector3(btScalar(1), btScalar(0.02), btScalar(0)), basis[0]); + basis[2] = btCross(btVector3(btScalar(-0.02), btScalar(1), btScalar(0)), basis[0]); if (basis[1].length() > basis[2].length()) { basis[1].normalize(); - } else { + } + else + { basis[1] = basis[2]; - basis[1].normalize (); + basis[1].normalize(); } - int p2 = maxdirsterid(verts,verts_count,basis[1],allow); - if(p2 == p0 || p2 == p1) + int p2 = maxdirsterid(verts, verts_count, basis[1], allow); + if (p2 == p0 || p2 == p1) { - p2 = maxdirsterid(verts,verts_count,-basis[1],allow); + p2 = maxdirsterid(verts, verts_count, -basis[1], allow); } - if(p2 == p0 || p2 == p1) - return int4(-1,-1,-1,-1); + if (p2 == p0 || p2 == p1) + return int4(-1, -1, -1, -1); basis[1] = verts[p2] - verts[p0]; - basis[2] = btCross(basis[1],basis[0]).normalized(); - int p3 = maxdirsterid(verts,verts_count,basis[2],allow); - if(p3==p0||p3==p1||p3==p2) p3 = maxdirsterid(verts,verts_count,-basis[2],allow); - if(p3==p0||p3==p1||p3==p2) - return int4(-1,-1,-1,-1); - btAssert(!(p0==p1||p0==p2||p0==p3||p1==p2||p1==p3||p2==p3)); - if(btDot(verts[p3]-verts[p0],btCross(verts[p1]-verts[p0],verts[p2]-verts[p0])) <0) {btSwap(p2,p3);} - return int4(p0,p1,p2,p3); + basis[2] = btCross(basis[1], basis[0]).normalized(); + int p3 = maxdirsterid(verts, verts_count, basis[2], allow); + if (p3 == p0 || p3 == p1 || p3 == p2) p3 = maxdirsterid(verts, verts_count, -basis[2], allow); + if (p3 == p0 || p3 == p1 || p3 == p2) + return int4(-1, -1, -1, -1); + btAssert(!(p0 == p1 || p0 == p2 || p0 == p3 || p1 == p2 || p1 == p3 || p2 == p3)); + if (btDot(verts[p3] - verts[p0], btCross(verts[p1] - verts[p0], verts[p2] - verts[p0])) < 0) + { + btSwap(p2, p3); + } + return int4(p0, p1, p2, p3); } -int HullLibrary::calchullgen(btVector3 *verts,int verts_count, int vlimit) +int HullLibrary::calchullgen(btVector3 *verts, int verts_count, int vlimit) { - if(verts_count <4) return 0; - if(vlimit==0) vlimit=1000000000; + if (verts_count < 4) return 0; + if (vlimit == 0) vlimit = 1000000000; int j; - btVector3 bmin(*verts),bmax(*verts); + btVector3 bmin(*verts), bmax(*verts); btAlignedObjectArray isextreme; isextreme.reserve(verts_count); btAlignedObjectArray allow; allow.reserve(verts_count); - for(j=0;jn=int3(2,3,1); - btHullTriangle *t1 = allocateTriangle(p[3],p[2],p[0]); t1->n=int3(3,2,0); - btHullTriangle *t2 = allocateTriangle(p[0],p[1],p[3]); t2->n=int3(0,1,3); - btHullTriangle *t3 = allocateTriangle(p[1],p[0],p[2]); t3->n=int3(1,0,2); - 
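// (The four triangles t0..t3 above form the initial tetrahedron over the
// simplex vertices p[0..3]; each n = int3(...) stores the indices of the
// three adjacent faces, so every edge starts out shared by exactly two
// triangles -- the invariant that checkit() asserts.)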
isextreme[p[0]]=isextreme[p[1]]=isextreme[p[2]]=isextreme[p[3]]=1; - checkit(t0);checkit(t1);checkit(t2);checkit(t3); - - for(j=0;jn = int3(2, 3, 1); + btHullTriangle *t1 = allocateTriangle(p[3], p[2], p[0]); + t1->n = int3(3, 2, 0); + btHullTriangle *t2 = allocateTriangle(p[0], p[1], p[3]); + t2->n = int3(0, 1, 3); + btHullTriangle *t3 = allocateTriangle(p[1], p[0], p[2]); + t3->n = int3(1, 0, 2); + isextreme[p[0]] = isextreme[p[1]] = isextreme[p[2]] = isextreme[p[3]] = 1; + checkit(t0); + checkit(t1); + checkit(t2); + checkit(t3); + + for (j = 0; j < m_tris.size(); j++) { - btHullTriangle *t=m_tris[j]; + btHullTriangle *t = m_tris[j]; btAssert(t); - btAssert(t->vmax<0); - btVector3 n=TriNormal(verts[(*t)[0]],verts[(*t)[1]],verts[(*t)[2]]); - t->vmax = maxdirsterid(verts,verts_count,n,allow); - t->rise = btDot(n,verts[t->vmax]-verts[(*t)[0]]); + btAssert(t->vmax < 0); + btVector3 n = TriNormal(verts[(*t)[0]], verts[(*t)[1]], verts[(*t)[2]]); + t->vmax = maxdirsterid(verts, verts_count, n, allow); + t->rise = btDot(n, verts[t->vmax] - verts[(*t)[0]]); } btHullTriangle *te; - vlimit-=4; - while(vlimit >0 && ((te=extrudable(epsilon)) != 0)) + vlimit -= 4; + while (vlimit > 0 && ((te = extrudable(epsilon)) != 0)) { //int3 ti=*te; - int v=te->vmax; + int v = te->vmax; btAssert(v != -1); btAssert(!isextreme[v]); // wtf we've already done this vertex - isextreme[v]=1; + isextreme[v] = 1; //if(v==p0 || v==p1 || v==p2 || v==p3) continue; // done these already - j=m_tris.size(); - while(j--) { - if(!m_tris[j]) continue; - int3 t=*m_tris[j]; - if(above(verts,t,verts[v],btScalar(0.01)*epsilon)) + j = m_tris.size(); + while (j--) + { + if (!m_tris[j]) continue; + int3 t = *m_tris[j]; + if (above(verts, t, verts[v], btScalar(0.01) * epsilon)) { - extrude(m_tris[j],v); + extrude(m_tris[j], v); } } // now check for those degenerate cases where we have a flipped triangle or a really skinny triangle - j=m_tris.size(); - while(j--) + j = m_tris.size(); + while (j--) { - if(!m_tris[j]) continue; - if(!hasvert(*m_tris[j],v)) break; - int3 nt=*m_tris[j]; - if(above(verts,nt,center,btScalar(0.01)*epsilon) || btCross(verts[nt[1]]-verts[nt[0]],verts[nt[2]]-verts[nt[1]]).length()< epsilon*epsilon*btScalar(0.1) ) + if (!m_tris[j]) continue; + if (!hasvert(*m_tris[j], v)) break; + int3 nt = *m_tris[j]; + if (above(verts, nt, center, btScalar(0.01) * epsilon) || btCross(verts[nt[1]] - verts[nt[0]], verts[nt[2]] - verts[nt[1]]).length() < epsilon * epsilon * btScalar(0.1)) { btHullTriangle *nb = m_tris[m_tris[j]->n[0]]; - btAssert(nb);btAssert(!hasvert(*nb,v));btAssert(nb->idid < j); + extrude(nb, v); + j = m_tris.size(); } - } - j=m_tris.size(); - while(j--) + } + j = m_tris.size(); + while (j--) { - btHullTriangle *t=m_tris[j]; - if(!t) continue; - if(t->vmax>=0) break; - btVector3 n=TriNormal(verts[(*t)[0]],verts[(*t)[1]],verts[(*t)[2]]); - t->vmax = maxdirsterid(verts,verts_count,n,allow); - if(isextreme[t->vmax]) + btHullTriangle *t = m_tris[j]; + if (!t) continue; + if (t->vmax >= 0) break; + btVector3 n = TriNormal(verts[(*t)[0]], verts[(*t)[1]], verts[(*t)[2]]); + t->vmax = maxdirsterid(verts, verts_count, n, allow); + if (isextreme[t->vmax]) { - t->vmax=-1; // already done that vertex - algorithm needs to be able to terminate. + t->vmax = -1; // already done that vertex - algorithm needs to be able to terminate. 
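// (A face left with vmax = -1 keeps its initial rise of 0, so extrudable()
// never selects it; once no remaining face has rise > epsilon, the outer
// while loop exits.)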
} else { - t->rise = btDot(n,verts[t->vmax]-verts[(*t)[0]]); + t->rise = btDot(n, verts[t->vmax] - verts[(*t)[0]]); } } - vlimit --; + vlimit--; } return 1; } -int HullLibrary::calchull(btVector3 *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit) +int HullLibrary::calchull(btVector3 *verts, int verts_count, TUIntArray &tris_out, int &tris_count, int vlimit) { - int rc=calchullgen(verts,verts_count, vlimit) ; - if(!rc) return 0; + int rc = calchullgen(verts, verts_count, vlimit); + if (!rc) return 0; btAlignedObjectArray ts; int i; - for(i=0;i(ts[i]); } @@ -643,29 +630,22 @@ int HullLibrary::calchull(btVector3 *verts,int verts_count, TUIntArray& tris_out return 1; } - - - - -bool HullLibrary::ComputeHull(unsigned int vcount,const btVector3 *vertices,PHullResult &result,unsigned int vlimit) +bool HullLibrary::ComputeHull(unsigned int vcount, const btVector3 *vertices, PHullResult &result, unsigned int vlimit) { - - int tris_count; - int ret = calchull( (btVector3 *) vertices, (int) vcount, result.m_Indices, tris_count, static_cast(vlimit) ); - if(!ret) return false; - result.mIndexCount = (unsigned int) (tris_count*3); - result.mFaceCount = (unsigned int) tris_count; - result.mVertices = (btVector3*) vertices; - result.mVcount = (unsigned int) vcount; + int tris_count; + int ret = calchull((btVector3 *)vertices, (int)vcount, result.m_Indices, tris_count, static_cast(vlimit)); + if (!ret) return false; + result.mIndexCount = (unsigned int)(tris_count * 3); + result.mFaceCount = (unsigned int)tris_count; + result.mVertices = (btVector3 *)vertices; + result.mVcount = (unsigned int)vcount; return true; - } - void ReleaseHull(PHullResult &result); void ReleaseHull(PHullResult &result) { - if ( result.m_Indices.size() ) + if (result.m_Indices.size()) { result.m_Indices.clear(); } @@ -675,7 +655,6 @@ void ReleaseHull(PHullResult &result) result.mVertices = 0; } - //********************************************************************* //********************************************************************* //******** HullLib header @@ -688,16 +667,15 @@ void ReleaseHull(PHullResult &result) //********************************************************************* //********************************************************************* -HullError HullLibrary::CreateConvexHull(const HullDesc &desc, // describes the input request - HullResult &result) // contains the resulst +HullError HullLibrary::CreateConvexHull(const HullDesc &desc, // describes the input request + HullResult &result) // contains the resulst { HullError ret = QE_FAIL; - PHullResult hr; unsigned int vcount = desc.mVcount; - if ( vcount < 8 ) vcount = 8; + if (vcount < 8) vcount = 8; btAlignedObjectArray vertexSource; vertexSource.resize(static_cast(vcount)); @@ -706,87 +684,82 @@ HullError HullLibrary::CreateConvexHull(const HullDesc &desc, // unsigned int ovcount; - bool ok = CleanupVertices(desc.mVcount,desc.mVertices, desc.mVertexStride, ovcount, &vertexSource[0], desc.mNormalEpsilon, scale ); // normalize point cloud, remove duplicates! + bool ok = CleanupVertices(desc.mVcount, desc.mVertices, desc.mVertexStride, ovcount, &vertexSource[0], desc.mNormalEpsilon, scale); // normalize point cloud, remove duplicates! - if ( ok ) + if (ok) { - - -// if ( 1 ) // scale vertices back to their original size. + // if ( 1 ) // scale vertices back to their original size. 
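// (CleanupVertices() above has normalized the point cloud into a unit box,
// returning the per-axis extents in 'scale'; the block below multiplies each
// point back out so the hull is computed in the original coordinates.)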
{ - for (unsigned int i=0; i(i)]; - v[0]*=scale[0]; - v[1]*=scale[1]; - v[2]*=scale[2]; + btVector3 &v = vertexSource[static_cast(i)]; + v[0] *= scale[0]; + v[1] *= scale[1]; + v[2] *= scale[2]; } } - ok = ComputeHull(ovcount,&vertexSource[0],hr,desc.mMaxVertices); + ok = ComputeHull(ovcount, &vertexSource[0], hr, desc.mMaxVertices); - if ( ok ) + if (ok) { - // re-index triangle mesh so it refers to only used vertices, rebuild a new vertex table. - btAlignedObjectArray vertexScratch; + btAlignedObjectArray vertexScratch; vertexScratch.resize(static_cast(hr.mVcount)); - BringOutYourDead(hr.mVertices,hr.mVcount, &vertexScratch[0], ovcount, &hr.m_Indices[0], hr.mIndexCount ); + BringOutYourDead(hr.mVertices, hr.mVcount, &vertexScratch[0], ovcount, &hr.m_Indices[0], hr.mIndexCount); ret = QE_OK; - if ( desc.HasHullFlag(QF_TRIANGLES) ) // if he wants the results as triangle! + if (desc.HasHullFlag(QF_TRIANGLES)) // if he wants the results as triangle! { - result.mPolygons = false; + result.mPolygons = false; result.mNumOutputVertices = ovcount; result.m_OutputVertices.resize(static_cast(ovcount)); - result.mNumFaces = hr.mFaceCount; - result.mNumIndices = hr.mIndexCount; + result.mNumFaces = hr.mFaceCount; + result.mNumIndices = hr.mIndexCount; result.m_Indices.resize(static_cast(hr.mIndexCount)); - memcpy(&result.m_OutputVertices[0], &vertexScratch[0], sizeof(btVector3)*ovcount ); + memcpy(&result.m_OutputVertices[0], &vertexScratch[0], sizeof(btVector3) * ovcount); - if ( desc.HasHullFlag(QF_REVERSE_ORDER) ) + if (desc.HasHullFlag(QF_REVERSE_ORDER)) { - const unsigned int *source = &hr.m_Indices[0]; - unsigned int *dest = &result.m_Indices[0]; + unsigned int *dest = &result.m_Indices[0]; - for (unsigned int i=0; i(ovcount)); - result.mNumFaces = hr.mFaceCount; - result.mNumIndices = hr.mIndexCount+hr.mFaceCount; + result.mNumFaces = hr.mFaceCount; + result.mNumIndices = hr.mIndexCount + hr.mFaceCount; result.m_Indices.resize(static_cast(result.mNumIndices)); - memcpy(&result.m_OutputVertices[0], &vertexScratch[0], sizeof(btVector3)*ovcount ); + memcpy(&result.m_OutputVertices[0], &vertexScratch[0], sizeof(btVector3) * ovcount); -// if ( 1 ) + // if ( 1 ) { const unsigned int *source = &hr.m_Indices[0]; - unsigned int *dest = &result.m_Indices[0]; - for (unsigned int i=0; i bmax[j] ) bmax[j] = p[j]; + if (p[j] < bmin[j]) bmin[j] = p[j]; + if (p[j] > bmax[j]) bmax[j] = p[j]; } } } @@ -905,28 +871,27 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, btVector3 center; - center[0] = dx*btScalar(0.5) + bmin[0]; - center[1] = dy*btScalar(0.5) + bmin[1]; - center[2] = dz*btScalar(0.5) + bmin[2]; + center[0] = dx * btScalar(0.5) + bmin[0]; + center[1] = dy * btScalar(0.5) + bmin[1]; + center[2] = dz * btScalar(0.5) + bmin[2]; - if ( dx < EPSILON || dy < EPSILON || dz < EPSILON || svcount < 3 ) + if (dx < EPSILON || dy < EPSILON || dz < EPSILON || svcount < 3) { - btScalar len = FLT_MAX; - if ( dx > EPSILON && dx < len ) len = dx; - if ( dy > EPSILON && dy < len ) len = dy; - if ( dz > EPSILON && dz < len ) len = dz; + if (dx > EPSILON && dx < len) len = dx; + if (dy > EPSILON && dy < len) len = dy; + if (dz > EPSILON && dz < len) len = dz; - if ( len == FLT_MAX ) + if (len == FLT_MAX) { - dx = dy = dz = btScalar(0.01); // one centimeter + dx = dy = dz = btScalar(0.01); // one centimeter } else { - if ( dx < EPSILON ) dx = len * btScalar(0.05); // 1/5th the shortest non-zero edge. 
- if ( dy < EPSILON ) dy = len * btScalar(0.05); - if ( dz < EPSILON ) dz = len * btScalar(0.05); + if (dx < EPSILON) dx = len * btScalar(0.05); // 1/5th the shortest non-zero edge. + if (dy < EPSILON) dy = len * btScalar(0.05); + if (dz < EPSILON) dz = len * btScalar(0.05); } btScalar x1 = center[0] - dx; @@ -938,22 +903,20 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, btScalar z1 = center[2] - dz; btScalar z2 = center[2] + dz; - addPoint(vcount,vertices,x1,y1,z1); - addPoint(vcount,vertices,x2,y1,z1); - addPoint(vcount,vertices,x2,y2,z1); - addPoint(vcount,vertices,x1,y2,z1); - addPoint(vcount,vertices,x1,y1,z2); - addPoint(vcount,vertices,x2,y1,z2); - addPoint(vcount,vertices,x2,y2,z2); - addPoint(vcount,vertices,x1,y2,z2); - - return true; // return cube - + addPoint(vcount, vertices, x1, y1, z1); + addPoint(vcount, vertices, x2, y1, z1); + addPoint(vcount, vertices, x2, y2, z1); + addPoint(vcount, vertices, x1, y2, z1); + addPoint(vcount, vertices, x1, y1, z2); + addPoint(vcount, vertices, x2, y1, z2); + addPoint(vcount, vertices, x2, y2, z2); + addPoint(vcount, vertices, x1, y2, z2); + return true; // return cube } else { - if ( scale ) + if (scale) { scale[0] = dx; scale[1] = dy; @@ -963,75 +926,70 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, recip[1] = 1 / dy; recip[2] = 1 / dz; - center[0]*=recip[0]; - center[1]*=recip[1]; - center[2]*=recip[2]; - + center[0] *= recip[0]; + center[1] *= recip[1]; + center[2] *= recip[2]; } - } + vtx = (const char *)svertices; - - vtx = (const char *) svertices; - - for (unsigned int i=0; igetX(); btScalar py = p->getY(); btScalar pz = p->getZ(); - if ( scale ) + if (scale) { - px = px*recip[0]; // normalize - py = py*recip[1]; // normalize - pz = pz*recip[2]; // normalize + px = px * recip[0]; // normalize + py = py * recip[1]; // normalize + pz = pz * recip[2]; // normalize } -// if ( 1 ) + // if ( 1 ) { unsigned int j; - for (j=0; j dist2 ) + if (dist1 > dist2) { v[0] = px; v[1] = py; v[2] = pz; - } break; } } - if ( j == vcount ) + if (j == vcount) { - btVector3& dest = vertices[vcount]; + btVector3 &dest = vertices[vcount]; dest[0] = px; dest[1] = py; dest[2] = pz; @@ -1042,18 +1000,18 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, } // ok..now make sure we didn't prune so many vertices it is now invalid. 
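// (An informal sketch of the welding rule applied above: two input points
// are merged when every coordinate differs by less than normalepsilon, and
// the surviving representative is the one farther from the cloud center,
// so repeated welding can only push retained points outward.)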
-// if ( 1 ) + // if ( 1 ) { - btScalar bmin[3] = { FLT_MAX, FLT_MAX, FLT_MAX }; - btScalar bmax[3] = { -FLT_MAX, -FLT_MAX, -FLT_MAX }; + btScalar bmin[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; + btScalar bmax[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; - for (unsigned int i=0; i bmax[j] ) bmax[j] = p[j]; + if (p[j] < bmin[j]) bmin[j] = p[j]; + if (p[j] > bmax[j]) bmax[j] = p[j]; } } @@ -1061,27 +1019,27 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, btScalar dy = bmax[1] - bmin[1]; btScalar dz = bmax[2] - bmin[2]; - if ( dx < EPSILON || dy < EPSILON || dz < EPSILON || vcount < 3) + if (dx < EPSILON || dy < EPSILON || dz < EPSILON || vcount < 3) { - btScalar cx = dx*btScalar(0.5) + bmin[0]; - btScalar cy = dy*btScalar(0.5) + bmin[1]; - btScalar cz = dz*btScalar(0.5) + bmin[2]; + btScalar cx = dx * btScalar(0.5) + bmin[0]; + btScalar cy = dy * btScalar(0.5) + bmin[1]; + btScalar cz = dz * btScalar(0.5) + bmin[2]; btScalar len = FLT_MAX; - if ( dx >= EPSILON && dx < len ) len = dx; - if ( dy >= EPSILON && dy < len ) len = dy; - if ( dz >= EPSILON && dz < len ) len = dz; + if (dx >= EPSILON && dx < len) len = dx; + if (dy >= EPSILON && dy < len) len = dy; + if (dz >= EPSILON && dz < len) len = dz; - if ( len == FLT_MAX ) + if (len == FLT_MAX) { - dx = dy = dz = btScalar(0.01); // one centimeter + dx = dy = dz = btScalar(0.01); // one centimeter } else { - if ( dx < EPSILON ) dx = len * btScalar(0.05); // 1/5th the shortest non-zero edge. - if ( dy < EPSILON ) dy = len * btScalar(0.05); - if ( dz < EPSILON ) dz = len * btScalar(0.05); + if (dx < EPSILON) dx = len * btScalar(0.05); // 1/5th the shortest non-zero edge. + if (dy < EPSILON) dy = len * btScalar(0.05); + if (dz < EPSILON) dz = len * btScalar(0.05); } btScalar x1 = cx - dx; @@ -1093,16 +1051,16 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, btScalar z1 = cz - dz; btScalar z2 = cz + dz; - vcount = 0; // add box + vcount = 0; // add box - addPoint(vcount,vertices,x1,y1,z1); - addPoint(vcount,vertices,x2,y1,z1); - addPoint(vcount,vertices,x2,y2,z1); - addPoint(vcount,vertices,x1,y2,z1); - addPoint(vcount,vertices,x1,y1,z2); - addPoint(vcount,vertices,x2,y1,z2); - addPoint(vcount,vertices,x2,y2,z2); - addPoint(vcount,vertices,x1,y2,z2); + addPoint(vcount, vertices, x1, y1, z1); + addPoint(vcount, vertices, x2, y1, z1); + addPoint(vcount, vertices, x2, y2, z1); + addPoint(vcount, vertices, x1, y2, z1); + addPoint(vcount, vertices, x1, y1, z2); + addPoint(vcount, vertices, x2, y1, z2); + addPoint(vcount, vertices, x2, y2, z2); + addPoint(vcount, vertices, x1, y2, z2); return true; } @@ -1111,57 +1069,52 @@ bool HullLibrary::CleanupVertices(unsigned int svcount, return true; } -void HullLibrary::BringOutYourDead(const btVector3* verts,unsigned int vcount, btVector3* overts,unsigned int &ocount,unsigned int *indices,unsigned indexcount) +void HullLibrary::BringOutYourDead(const btVector3 *verts, unsigned int vcount, btVector3 *overts, unsigned int &ocount, unsigned int *indices, unsigned indexcount) { - btAlignedObjectArraytmpIndices; + btAlignedObjectArray tmpIndices; tmpIndices.resize(m_vertexIndexMapping.size()); int i; - for (i=0;i(vcount)); - memset(&usedIndices[0],0,sizeof(unsigned int)*vcount); + memset(&usedIndices[0], 0, sizeof(unsigned int) * vcount); ocount = 0; - for (i=0; i= 0 && v < vcount ); + btAssert(v >= 0 && v < vcount); - if ( usedIndices[static_cast(v)] ) // if already remapped + if (usedIndices[static_cast(v)]) // if already remapped { - indices[i] = usedIndices[static_cast(v)]-1; // index to new array + 
indices[i] = usedIndices[static_cast(v)] - 1; // index to new array } else { + indices[i] = ocount; // new index mapping - indices[i] = ocount; // new index mapping - - overts[ocount][0] = verts[v][0]; // copy old vert to new vert array + overts[ocount][0] = verts[v][0]; // copy old vert to new vert array overts[ocount][1] = verts[v][1]; overts[ocount][2] = verts[v][2]; - for (int k=0;k=0 && ocount <= vcount ); + ocount++; // increment output vert count - usedIndices[static_cast(v)] = ocount; // assign new index remapping + btAssert(ocount >= 0 && ocount <= vcount); - + usedIndices[static_cast(v)] = ocount; // assign new index remapping } } - - } diff --git a/thirdparty/bullet/LinearMath/btConvexHull.h b/thirdparty/bullet/LinearMath/btConvexHull.h index 69c52bc6f8..f890d75ea1 100644 --- a/thirdparty/bullet/LinearMath/btConvexHull.h +++ b/thirdparty/bullet/LinearMath/btConvexHull.h @@ -34,106 +34,102 @@ public: mNumFaces = 0; mNumIndices = 0; } - bool mPolygons; // true if indices represents polygons, false indices are triangles - unsigned int mNumOutputVertices; // number of vertices in the output hull - btAlignedObjectArray m_OutputVertices; // array of vertices - unsigned int mNumFaces; // the number of faces produced - unsigned int mNumIndices; // the total number of indices - btAlignedObjectArray m_Indices; // pointer to indices. - -// If triangles, then indices are array indexes into the vertex list. -// If polygons, indices are in the form (number of points in face) (p1, p2, p3, ..) etc.. + bool mPolygons; // true if indices represents polygons, false indices are triangles + unsigned int mNumOutputVertices; // number of vertices in the output hull + btAlignedObjectArray m_OutputVertices; // array of vertices + unsigned int mNumFaces; // the number of faces produced + unsigned int mNumIndices; // the total number of indices + btAlignedObjectArray m_Indices; // pointer to indices. + + // If triangles, then indices are array indexes into the vertex list. + // If polygons, indices are in the form (number of points in face) (p1, p2, p3, ..) etc.. }; enum HullFlag { - QF_TRIANGLES = (1<<0), // report results as triangles, not polygons. - QF_REVERSE_ORDER = (1<<1), // reverse order of the triangle indices. - QF_DEFAULT = QF_TRIANGLES + QF_TRIANGLES = (1 << 0), // report results as triangles, not polygons. + QF_REVERSE_ORDER = (1 << 1), // reverse order of the triangle indices. + QF_DEFAULT = QF_TRIANGLES }; - class HullDesc { public: HullDesc(void) { - mFlags = QF_DEFAULT; - mVcount = 0; - mVertices = 0; - mVertexStride = sizeof(btVector3); - mNormalEpsilon = 0.001f; - mMaxVertices = 4096; // maximum number of points to be considered for a convex hull. - mMaxFaces = 4096; + mFlags = QF_DEFAULT; + mVcount = 0; + mVertices = 0; + mVertexStride = sizeof(btVector3); + mNormalEpsilon = 0.001f; + mMaxVertices = 4096; // maximum number of points to be considered for a convex hull. 
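// (4096 is a safety cap rather than a hard algorithmic limit: calchullgen()
// treats a vertex limit of 0 as unbounded by substituting 1000000000.)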
+ mMaxFaces = 4096; }; HullDesc(HullFlag flag, - unsigned int vcount, - const btVector3 *vertices, - unsigned int stride = sizeof(btVector3)) + unsigned int vcount, + const btVector3* vertices, + unsigned int stride = sizeof(btVector3)) { - mFlags = flag; - mVcount = vcount; - mVertices = vertices; - mVertexStride = stride; - mNormalEpsilon = btScalar(0.001); - mMaxVertices = 4096; + mFlags = flag; + mVcount = vcount; + mVertices = vertices; + mVertexStride = stride; + mNormalEpsilon = btScalar(0.001); + mMaxVertices = 4096; } bool HasHullFlag(HullFlag flag) const { - if ( mFlags & flag ) return true; + if (mFlags & flag) return true; return false; } void SetHullFlag(HullFlag flag) { - mFlags|=flag; + mFlags |= flag; } void ClearHullFlag(HullFlag flag) { - mFlags&=~flag; + mFlags &= ~flag; } - unsigned int mFlags; // flags to use when generating the convex hull. - unsigned int mVcount; // number of vertices in the input point cloud - const btVector3 *mVertices; // the array of vertices. - unsigned int mVertexStride; // the stride of each vertex, in bytes. - btScalar mNormalEpsilon; // the epsilon for removing duplicates. This is a normalized value, if normalized bit is on. - unsigned int mMaxVertices; // maximum number of vertices to be considered for the hull! - unsigned int mMaxFaces; + unsigned int mFlags; // flags to use when generating the convex hull. + unsigned int mVcount; // number of vertices in the input point cloud + const btVector3* mVertices; // the array of vertices. + unsigned int mVertexStride; // the stride of each vertex, in bytes. + btScalar mNormalEpsilon; // the epsilon for removing duplicates. This is a normalized value, if normalized bit is on. + unsigned int mMaxVertices; // maximum number of vertices to be considered for the hull! + unsigned int mMaxFaces; }; enum HullError { - QE_OK, // success! - QE_FAIL // failed. + QE_OK, // success! + QE_FAIL // failed. 
}; class btPlane { - public: - btVector3 normal; - btScalar dist; // distance below origin - the D from plane equasion Ax+By+Cz+D=0 - btPlane(const btVector3 &n,btScalar d):normal(n),dist(d){} - btPlane():normal(),dist(0){} - +public: + btVector3 normal; + btScalar dist; // distance below origin - the D from plane equasion Ax+By+Cz+D=0 + btPlane(const btVector3& n, btScalar d) : normal(n), dist(d) {} + btPlane() : normal(), dist(0) {} }; - - -class ConvexH +class ConvexH { - public: +public: class HalfEdge { - public: + public: short ea; // the other half of the edge (index into edges list) unsigned char v; // the vertex at the start of this edge (index into vertices list) unsigned char p; // the facet on which this edge lies (index into facets list) - HalfEdge(){} - HalfEdge(short _ea,unsigned char _v, unsigned char _p):ea(_ea),v(_v),p(_p){} + HalfEdge() {} + HalfEdge(short _ea, unsigned char _v, unsigned char _p) : ea(_ea), v(_v), p(_p) {} }; ConvexH() { @@ -143,25 +139,29 @@ class ConvexH } btAlignedObjectArray vertices; btAlignedObjectArray edges; - btAlignedObjectArray facets; - ConvexH(int vertices_size,int edges_size,int facets_size); + btAlignedObjectArray facets; + ConvexH(int vertices_size, int edges_size, int facets_size); }; - class int4 { public: - int x,y,z,w; + int x, y, z, w; int4(){}; - int4(int _x,int _y, int _z,int _w){x=_x;y=_y;z=_z;w=_w;} - const int& operator[](int i) const {return (&x)[i];} - int& operator[](int i) {return (&x)[i];} + int4(int _x, int _y, int _z, int _w) + { + x = _x; + y = _y; + z = _z; + w = _w; + } + const int& operator[](int i) const { return (&x)[i]; } + int& operator[](int i) { return (&x)[i]; } }; class PHullResult { public: - PHullResult(void) { mVcount = 0; @@ -173,69 +173,61 @@ public: unsigned int mVcount; unsigned int mIndexCount; unsigned int mFaceCount; - btVector3* mVertices; + btVector3* mVertices; TUIntArray m_Indices; }; - - ///The HullLibrary class can create a convex hull from a collection of vertices, using the ComputeHull method. ///The btShapeHull class uses this HullLibrary to create a approximate convex mesh given a general (non-polyhedral) convex shape. class HullLibrary { - btAlignedObjectArray m_tris; public: - btAlignedObjectArray m_vertexIndexMapping; - - HullError CreateConvexHull(const HullDesc& desc, // describes the input request - HullResult& result); // contains the resulst - HullError ReleaseResult(HullResult &result); // release memory allocated for this result, we are done with it. + HullError CreateConvexHull(const HullDesc& desc, // describes the input request + HullResult& result); // contains the resulst + HullError ReleaseResult(HullResult& result); // release memory allocated for this result, we are done with it. 
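// A minimal calling sequence for this interface (hypothetical driver code,
// not part of this patch; names like pointCount/points are illustrative):
//
//   HullDesc desc(QF_TRIANGLES, pointCount, points, sizeof(btVector3));
//   HullResult result;
//   HullLibrary lib;
//   if (lib.CreateConvexHull(desc, result) == QE_OK)
//   {
//       // With QF_TRIANGLES, result.m_Indices holds mNumFaces index
//       // triples into result.m_OutputVertices.
//       lib.ReleaseResult(result);
//   }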
private: + bool ComputeHull(unsigned int vcount, const btVector3* vertices, PHullResult& result, unsigned int vlimit); - bool ComputeHull(unsigned int vcount,const btVector3 *vertices,PHullResult &result,unsigned int vlimit); - - class btHullTriangle* allocateTriangle(int a,int b,int c); - void deAllocateTriangle(btHullTriangle*); - void b2bfix(btHullTriangle* s,btHullTriangle*t); + class btHullTriangle* allocateTriangle(int a, int b, int c); + void deAllocateTriangle(btHullTriangle*); + void b2bfix(btHullTriangle* s, btHullTriangle* t); - void removeb2b(btHullTriangle* s,btHullTriangle*t); + void removeb2b(btHullTriangle* s, btHullTriangle* t); - void checkit(btHullTriangle *t); + void checkit(btHullTriangle* t); btHullTriangle* extrudable(btScalar epsilon); - int calchull(btVector3 *verts,int verts_count, TUIntArray& tris_out, int &tris_count,int vlimit); + int calchull(btVector3* verts, int verts_count, TUIntArray& tris_out, int& tris_count, int vlimit); - int calchullgen(btVector3 *verts,int verts_count, int vlimit); + int calchullgen(btVector3* verts, int verts_count, int vlimit); - int4 FindSimplex(btVector3 *verts,int verts_count,btAlignedObjectArray &allow); + int4 FindSimplex(btVector3* verts, int verts_count, btAlignedObjectArray& allow); - class ConvexH* ConvexHCrop(ConvexH& convex,const btPlane& slice); + class ConvexH* ConvexHCrop(ConvexH& convex, const btPlane& slice); - void extrude(class btHullTriangle* t0,int v); + void extrude(class btHullTriangle* t0, int v); ConvexH* test_cube(); - //BringOutYourDead (John Ratcliff): When you create a convex hull you hand it a large input set of vertices forming a 'point cloud'. + //BringOutYourDead (John Ratcliff): When you create a convex hull you hand it a large input set of vertices forming a 'point cloud'. //After the hull is generated it give you back a set of polygon faces which index the *original* point cloud. //The thing is, often times, there are many 'dead vertices' in the point cloud that are on longer referenced by the hull. //The routine 'BringOutYourDead' find only the referenced vertices, copies them to an new buffer, and re-indexes the hull so that it is a minimal representation. - void BringOutYourDead(const btVector3* verts,unsigned int vcount, btVector3* overts,unsigned int &ocount,unsigned int* indices,unsigned indexcount); + void BringOutYourDead(const btVector3* verts, unsigned int vcount, btVector3* overts, unsigned int& ocount, unsigned int* indices, unsigned indexcount); bool CleanupVertices(unsigned int svcount, - const btVector3* svertices, - unsigned int stride, - unsigned int &vcount, // output number of vertices - btVector3* vertices, // location to store the results. - btScalar normalepsilon, - btVector3& scale); + const btVector3* svertices, + unsigned int stride, + unsigned int& vcount, // output number of vertices + btVector3* vertices, // location to store the results. 
+ btScalar normalepsilon, + btVector3& scale); }; - -#endif //BT_CD_HULL_H - +#endif //BT_CD_HULL_H diff --git a/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp b/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp index 2ea22cbe3b..8bbfdc5f25 100644 --- a/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp +++ b/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp @@ -20,846 +20,847 @@ subject to the following restrictions: #include "btVector3.h" #ifdef __GNUC__ - #include +#include #elif defined(_MSC_VER) - typedef __int32 int32_t; - typedef __int64 int64_t; - typedef unsigned __int32 uint32_t; - typedef unsigned __int64 uint64_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; #else - typedef int int32_t; - typedef long long int int64_t; - typedef unsigned int uint32_t; - typedef unsigned long long int uint64_t; +typedef int int32_t; +typedef long long int int64_t; +typedef unsigned int uint32_t; +typedef unsigned long long int uint64_t; #endif - //The definition of USE_X86_64_ASM is moved into the build system. You can enable it manually by commenting out the following lines //#if (defined(__GNUC__) && defined(__x86_64__) && !defined(__ICL)) // || (defined(__ICL) && defined(_M_X64)) bug in Intel compiler, disable inline assembly // #define USE_X86_64_ASM //#endif - //#define DEBUG_CONVEX_HULL //#define SHOW_ITERATIONS #if defined(DEBUG_CONVEX_HULL) || defined(SHOW_ITERATIONS) - #include +#include #endif // Convex hull implementation based on Preparata and Hong // Ole Kniemeyer, MAXON Computer GmbH class btConvexHullInternal { +public: + class Point64 + { public: - - class Point64 - { - public: - int64_t x; - int64_t y; - int64_t z; - - Point64(int64_t x, int64_t y, int64_t z): x(x), y(y), z(z) - { - } + int64_t x; + int64_t y; + int64_t z; - bool isZero() - { - return (x == 0) && (y == 0) && (z == 0); - } + Point64(int64_t x, int64_t y, int64_t z) : x(x), y(y), z(z) + { + } - int64_t dot(const Point64& b) const - { - return x * b.x + y * b.y + z * b.z; - } - }; - - class Point32 - { - public: - int32_t x; - int32_t y; - int32_t z; - int index; - - Point32() - { - } - - Point32(int32_t x, int32_t y, int32_t z): x(x), y(y), z(z), index(-1) - { - } - - bool operator==(const Point32& b) const - { - return (x == b.x) && (y == b.y) && (z == b.z); - } + bool isZero() + { + return (x == 0) && (y == 0) && (z == 0); + } - bool operator!=(const Point32& b) const - { - return (x != b.x) || (y != b.y) || (z != b.z); - } + int64_t dot(const Point64& b) const + { + return x * b.x + y * b.y + z * b.z; + } + }; - bool isZero() - { - return (x == 0) && (y == 0) && (z == 0); - } + class Point32 + { + public: + int32_t x; + int32_t y; + int32_t z; + int index; - Point64 cross(const Point32& b) const - { - return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x); - } + Point32() + { + } - Point64 cross(const Point64& b) const - { - return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x); - } + Point32(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z), index(-1) + { + } - int64_t dot(const Point32& b) const - { - return x * b.x + y * b.y + z * b.z; - } + bool operator==(const Point32& b) const + { + return (x == b.x) && (y == b.y) && (z == b.z); + } - int64_t dot(const Point64& b) const - { - return x * b.x + y * b.y + z * b.z; - } + bool operator!=(const Point32& b) const + { + return (x != b.x) || (y != b.y) || (z != b.z); + } - Point32 operator+(const Point32& b) const - { - return 
Point32(x + b.x, y + b.y, z + b.z); - } + bool isZero() + { + return (x == 0) && (y == 0) && (z == 0); + } - Point32 operator-(const Point32& b) const - { - return Point32(x - b.x, y - b.y, z - b.z); - } - }; + Point64 cross(const Point32& b) const + { + return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x); + } - class Int128 + Point64 cross(const Point64& b) const { - public: - uint64_t low; - uint64_t high; + return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x); + } - Int128() - { - } + int64_t dot(const Point32& b) const + { + return x * b.x + y * b.y + z * b.z; + } - Int128(uint64_t low, uint64_t high): low(low), high(high) - { - } + int64_t dot(const Point64& b) const + { + return x * b.x + y * b.y + z * b.z; + } - Int128(uint64_t low): low(low), high(0) - { - } + Point32 operator+(const Point32& b) const + { + return Point32(x + b.x, y + b.y, z + b.z); + } - Int128(int64_t value): low(value), high((value >= 0) ? 0 : (uint64_t) -1LL) - { - } + Point32 operator-(const Point32& b) const + { + return Point32(x - b.x, y - b.y, z - b.z); + } + }; - static Int128 mul(int64_t a, int64_t b); + class Int128 + { + public: + uint64_t low; + uint64_t high; - static Int128 mul(uint64_t a, uint64_t b); + Int128() + { + } - Int128 operator-() const - { - return Int128((uint64_t) -(int64_t)low, ~high + (low == 0)); - } + Int128(uint64_t low, uint64_t high) : low(low), high(high) + { + } - Int128 operator+(const Int128& b) const - { + Int128(uint64_t low) : low(low), high(0) + { + } + + Int128(int64_t value) : low(value), high((value >= 0) ? 0 : (uint64_t)-1LL) + { + } + + static Int128 mul(int64_t a, int64_t b); + + static Int128 mul(uint64_t a, uint64_t b); + + Int128 operator-() const + { + return Int128((uint64_t) - (int64_t)low, ~high + (low == 0)); + } + + Int128 operator+(const Int128& b) const + { #ifdef USE_X86_64_ASM - Int128 result; - __asm__ ("addq %[bl], %[rl]\n\t" - "adcq %[bh], %[rh]\n\t" - : [rl] "=r" (result.low), [rh] "=r" (result.high) - : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) - : "cc" ); - return result; + Int128 result; + __asm__( + "addq %[bl], %[rl]\n\t" + "adcq %[bh], %[rh]\n\t" + : [rl] "=r"(result.low), [rh] "=r"(result.high) + : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) + : "cc"); + return result; #else - uint64_t lo = low + b.low; - return Int128(lo, high + b.high + (lo < low)); + uint64_t lo = low + b.low; + return Int128(lo, high + b.high + (lo < low)); #endif - } + } - Int128 operator-(const Int128& b) const - { + Int128 operator-(const Int128& b) const + { #ifdef USE_X86_64_ASM - Int128 result; - __asm__ ("subq %[bl], %[rl]\n\t" - "sbbq %[bh], %[rh]\n\t" - : [rl] "=r" (result.low), [rh] "=r" (result.high) - : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) - : "cc" ); - return result; + Int128 result; + __asm__( + "subq %[bl], %[rl]\n\t" + "sbbq %[bh], %[rh]\n\t" + : [rl] "=r"(result.low), [rh] "=r"(result.high) + : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) + : "cc"); + return result; #else - return *this + -b; + return *this + -b; #endif - } + } - Int128& operator+=(const Int128& b) - { + Int128& operator+=(const Int128& b) + { #ifdef USE_X86_64_ASM - __asm__ ("addq %[bl], %[rl]\n\t" - "adcq %[bh], %[rh]\n\t" - : [rl] "=r" (low), [rh] "=r" (high) - : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) - : "cc" ); + __asm__( + "addq %[bl], %[rl]\n\t" + "adcq %[bh], %[rh]\n\t" + : [rl] "=r"(low), [rh] "=r"(high) + : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high) + : "cc"); 
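// (The inline asm above implements 128-bit addition as addq/adcq: add the
// low 64-bit words, then add the high words plus the carry flag -- the same
// carry the portable '#else' branch below detects with 'lo < low'.)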
#else - uint64_t lo = low + b.low; - if (lo < low) - { - ++high; - } - low = lo; - high += b.high; + uint64_t lo = low + b.low; + if (lo < low) + { + ++high; + } + low = lo; + high += b.high; #endif - return *this; - } + return *this; + } - Int128& operator++() - { - if (++low == 0) - { - ++high; - } - return *this; - } + Int128& operator++() + { + if (++low == 0) + { + ++high; + } + return *this; + } - Int128 operator*(int64_t b) const; + Int128 operator*(int64_t b) const; - btScalar toScalar() const - { - return ((int64_t) high >= 0) ? btScalar(high) * (btScalar(0x100000000LL) * btScalar(0x100000000LL)) + btScalar(low) - : -(-*this).toScalar(); - } + btScalar toScalar() const + { + return ((int64_t)high >= 0) ? btScalar(high) * (btScalar(0x100000000LL) * btScalar(0x100000000LL)) + btScalar(low) + : -(-*this).toScalar(); + } - int getSign() const - { - return ((int64_t) high < 0) ? -1 : (high || low) ? 1 : 0; - } + int getSign() const + { + return ((int64_t)high < 0) ? -1 : (high || low) ? 1 : 0; + } - bool operator<(const Int128& b) const - { - return (high < b.high) || ((high == b.high) && (low < b.low)); - } + bool operator<(const Int128& b) const + { + return (high < b.high) || ((high == b.high) && (low < b.low)); + } - int ucmp(const Int128&b) const - { - if (high < b.high) - { - return -1; - } - if (high > b.high) - { - return 1; - } - if (low < b.low) - { - return -1; - } - if (low > b.low) - { - return 1; - } - return 0; - } - }; + int ucmp(const Int128& b) const + { + if (high < b.high) + { + return -1; + } + if (high > b.high) + { + return 1; + } + if (low < b.low) + { + return -1; + } + if (low > b.low) + { + return 1; + } + return 0; + } + }; + class Rational64 + { + private: + uint64_t m_numerator; + uint64_t m_denominator; + int sign; - class Rational64 + public: + Rational64(int64_t numerator, int64_t denominator) { - private: - uint64_t m_numerator; - uint64_t m_denominator; - int sign; - - public: - Rational64(int64_t numerator, int64_t denominator) - { - if (numerator > 0) - { - sign = 1; - m_numerator = (uint64_t) numerator; - } - else if (numerator < 0) - { - sign = -1; - m_numerator = (uint64_t) -numerator; - } - else - { - sign = 0; - m_numerator = 0; - } - if (denominator > 0) - { - m_denominator = (uint64_t) denominator; - } - else if (denominator < 0) - { - sign = -sign; - m_denominator = (uint64_t) -denominator; - } - else - { - m_denominator = 0; - } - } - - bool isNegativeInfinity() const - { - return (sign < 0) && (m_denominator == 0); - } - - bool isNaN() const - { - return (sign == 0) && (m_denominator == 0); - } - - int compare(const Rational64& b) const; - - btScalar toScalar() const - { - return sign * ((m_denominator == 0) ? 
SIMD_INFINITY : (btScalar) m_numerator / m_denominator); - } - }; + if (numerator > 0) + { + sign = 1; + m_numerator = (uint64_t)numerator; + } + else if (numerator < 0) + { + sign = -1; + m_numerator = (uint64_t)-numerator; + } + else + { + sign = 0; + m_numerator = 0; + } + if (denominator > 0) + { + m_denominator = (uint64_t)denominator; + } + else if (denominator < 0) + { + sign = -sign; + m_denominator = (uint64_t)-denominator; + } + else + { + m_denominator = 0; + } + } + bool isNegativeInfinity() const + { + return (sign < 0) && (m_denominator == 0); + } - class Rational128 + bool isNaN() const { - private: - Int128 numerator; - Int128 denominator; - int sign; - bool isInt64; + return (sign == 0) && (m_denominator == 0); + } - public: - Rational128(int64_t value) - { - if (value > 0) - { - sign = 1; - this->numerator = value; - } - else if (value < 0) - { - sign = -1; - this->numerator = -value; - } - else - { - sign = 0; - this->numerator = (uint64_t) 0; - } - this->denominator = (uint64_t) 1; - isInt64 = true; - } + int compare(const Rational64& b) const; - Rational128(const Int128& numerator, const Int128& denominator) - { - sign = numerator.getSign(); - if (sign >= 0) - { - this->numerator = numerator; - } - else - { - this->numerator = -numerator; - } - int dsign = denominator.getSign(); - if (dsign >= 0) - { - this->denominator = denominator; - } - else - { - sign = -sign; - this->denominator = -denominator; - } - isInt64 = false; - } + btScalar toScalar() const + { + return sign * ((m_denominator == 0) ? SIMD_INFINITY : (btScalar)m_numerator / m_denominator); + } + }; + + class Rational128 + { + private: + Int128 numerator; + Int128 denominator; + int sign; + bool isInt64; + + public: + Rational128(int64_t value) + { + if (value > 0) + { + sign = 1; + this->numerator = value; + } + else if (value < 0) + { + sign = -1; + this->numerator = -value; + } + else + { + sign = 0; + this->numerator = (uint64_t)0; + } + this->denominator = (uint64_t)1; + isInt64 = true; + } + + Rational128(const Int128& numerator, const Int128& denominator) + { + sign = numerator.getSign(); + if (sign >= 0) + { + this->numerator = numerator; + } + else + { + this->numerator = -numerator; + } + int dsign = denominator.getSign(); + if (dsign >= 0) + { + this->denominator = denominator; + } + else + { + sign = -sign; + this->denominator = -denominator; + } + isInt64 = false; + } + + int compare(const Rational128& b) const; + + int compare(int64_t b) const; + + btScalar toScalar() const + { + return sign * ((denominator.getSign() == 0) ? 
SIMD_INFINITY : numerator.toScalar() / denominator.toScalar()); + } + }; + + class PointR128 + { + public: + Int128 x; + Int128 y; + Int128 z; + Int128 denominator; + + PointR128() + { + } + + PointR128(Int128 x, Int128 y, Int128 z, Int128 denominator) : x(x), y(y), z(z), denominator(denominator) + { + } + + btScalar xvalue() const + { + return x.toScalar() / denominator.toScalar(); + } + + btScalar yvalue() const + { + return y.toScalar() / denominator.toScalar(); + } + + btScalar zvalue() const + { + return z.toScalar() / denominator.toScalar(); + } + }; + + class Edge; + class Face; + + class Vertex + { + public: + Vertex* next; + Vertex* prev; + Edge* edges; + Face* firstNearbyFace; + Face* lastNearbyFace; + PointR128 point128; + Point32 point; + int copy; + + Vertex() : next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1) + { + } + +#ifdef DEBUG_CONVEX_HULL + void print() + { + printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z); + } + + void printGraph(); +#endif + + Point32 operator-(const Vertex& b) const + { + return point - b.point; + } + + Rational128 dot(const Point64& b) const + { + return (point.index >= 0) ? Rational128(point.dot(b)) + : Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator); + } + + btScalar xvalue() const + { + return (point.index >= 0) ? btScalar(point.x) : point128.xvalue(); + } + + btScalar yvalue() const + { + return (point.index >= 0) ? btScalar(point.y) : point128.yvalue(); + } + + btScalar zvalue() const + { + return (point.index >= 0) ? btScalar(point.z) : point128.zvalue(); + } + + void receiveNearbyFaces(Vertex* src) + { + if (lastNearbyFace) + { + lastNearbyFace->nextWithSameNearbyVertex = src->firstNearbyFace; + } + else + { + firstNearbyFace = src->firstNearbyFace; + } + if (src->lastNearbyFace) + { + lastNearbyFace = src->lastNearbyFace; + } + for (Face* f = src->firstNearbyFace; f; f = f->nextWithSameNearbyVertex) + { + btAssert(f->nearbyVertex == src); + f->nearbyVertex = this; + } + src->firstNearbyFace = NULL; + src->lastNearbyFace = NULL; + } + }; + + class Edge + { + public: + Edge* next; + Edge* prev; + Edge* reverse; + Vertex* target; + Face* face; + int copy; + + ~Edge() + { + next = NULL; + prev = NULL; + reverse = NULL; + target = NULL; + face = NULL; + } - int compare(const Rational128& b) const; + void link(Edge* n) + { + btAssert(reverse->target == n->reverse->target); + next = n; + n->prev = this; + } - int compare(int64_t b) const; +#ifdef DEBUG_CONVEX_HULL + void print() + { + printf("E%p : %d -> %d, n=%p p=%p (0 %d\t%d\t%d) -> (%d %d %d)", this, reverse->target->point.index, target->point.index, next, prev, + reverse->target->point.x, reverse->target->point.y, reverse->target->point.z, target->point.x, target->point.y, target->point.z); + } +#endif + }; - btScalar toScalar() const - { - return sign * ((denominator.getSign() == 0) ? 
SIMD_INFINITY : numerator.toScalar() / denominator.toScalar()); - } - }; + class Face + { + public: + Face* next; + Vertex* nearbyVertex; + Face* nextWithSameNearbyVertex; + Point32 origin; + Point32 dir0; + Point32 dir1; - class PointR128 + Face() : next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL) { - public: - Int128 x; - Int128 y; - Int128 z; - Int128 denominator; + } - PointR128() - { - } + void init(Vertex* a, Vertex* b, Vertex* c) + { + nearbyVertex = a; + origin = a->point; + dir0 = *b - *a; + dir1 = *c - *a; + if (a->lastNearbyFace) + { + a->lastNearbyFace->nextWithSameNearbyVertex = this; + } + else + { + a->firstNearbyFace = this; + } + a->lastNearbyFace = this; + } - PointR128(Int128 x, Int128 y, Int128 z, Int128 denominator): x(x), y(y), z(z), denominator(denominator) - { - } + Point64 getNormal() + { + return dir0.cross(dir1); + } + }; - btScalar xvalue() const - { - return x.toScalar() / denominator.toScalar(); - } + template + class DMul + { + private: + static uint32_t high(uint64_t value) + { + return (uint32_t)(value >> 32); + } - btScalar yvalue() const - { - return y.toScalar() / denominator.toScalar(); - } + static uint32_t low(uint64_t value) + { + return (uint32_t)value; + } - btScalar zvalue() const - { - return z.toScalar() / denominator.toScalar(); - } - }; + static uint64_t mul(uint32_t a, uint32_t b) + { + return (uint64_t)a * (uint64_t)b; + } + static void shlHalf(uint64_t& value) + { + value <<= 32; + } - class Edge; - class Face; + static uint64_t high(Int128 value) + { + return value.high; + } - class Vertex + static uint64_t low(Int128 value) { - public: - Vertex* next; - Vertex* prev; - Edge* edges; - Face* firstNearbyFace; - Face* lastNearbyFace; - PointR128 point128; - Point32 point; - int copy; - - Vertex(): next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1) - { - } + return value.low; + } -#ifdef DEBUG_CONVEX_HULL - void print() - { - printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z); - } + static Int128 mul(uint64_t a, uint64_t b) + { + return Int128::mul(a, b); + } - void printGraph(); -#endif + static void shlHalf(Int128& value) + { + value.high = value.low; + value.low = 0; + } - Point32 operator-(const Vertex& b) const - { - return point - b.point; - } + public: + static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh) + { + UWord p00 = mul(low(a), low(b)); + UWord p01 = mul(low(a), high(b)); + UWord p10 = mul(high(a), low(b)); + UWord p11 = mul(high(a), high(b)); + UWord p0110 = UWord(low(p01)) + UWord(low(p10)); + p11 += high(p01); + p11 += high(p10); + p11 += high(p0110); + shlHalf(p0110); + p00 += p0110; + if (p00 < p0110) + { + ++p11; + } + resLow = p00; + resHigh = p11; + } + }; - Rational128 dot(const Point64& b) const - { - return (point.index >= 0) ? Rational128(point.dot(b)) - : Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator); - } +private: + class IntermediateHull + { + public: + Vertex* minXy; + Vertex* maxXy; + Vertex* minYx; + Vertex* maxYx; - btScalar xvalue() const - { - return (point.index >= 0) ? btScalar(point.x) : point128.xvalue(); - } + IntermediateHull() : minXy(NULL), maxXy(NULL), minYx(NULL), maxYx(NULL) + { + } - btScalar yvalue() const - { - return (point.index >= 0) ? btScalar(point.y) : point128.yvalue(); - } + void print(); + }; - btScalar zvalue() const - { - return (point.index >= 0) ? 
btScalar(point.z) : point128.zvalue(); - } + enum Orientation + { + NONE, + CLOCKWISE, + COUNTER_CLOCKWISE + }; - void receiveNearbyFaces(Vertex* src) - { - if (lastNearbyFace) - { - lastNearbyFace->nextWithSameNearbyVertex = src->firstNearbyFace; - } - else - { - firstNearbyFace = src->firstNearbyFace; - } - if (src->lastNearbyFace) - { - lastNearbyFace = src->lastNearbyFace; - } - for (Face* f = src->firstNearbyFace; f; f = f->nextWithSameNearbyVertex) - { - btAssert(f->nearbyVertex == src); - f->nearbyVertex = this; - } - src->firstNearbyFace = NULL; - src->lastNearbyFace = NULL; - } - }; + template + class PoolArray + { + private: + T* array; + int size; + public: + PoolArray* next; - class Edge + PoolArray(int size) : size(size), next(NULL) { - public: - Edge* next; - Edge* prev; - Edge* reverse; - Vertex* target; - Face* face; - int copy; - - ~Edge() - { - next = NULL; - prev = NULL; - reverse = NULL; - target = NULL; - face = NULL; - } - - void link(Edge* n) - { - btAssert(reverse->target == n->reverse->target); - next = n; - n->prev = this; - } - -#ifdef DEBUG_CONVEX_HULL - void print() - { - printf("E%p : %d -> %d, n=%p p=%p (0 %d\t%d\t%d) -> (%d %d %d)", this, reverse->target->point.index, target->point.index, next, prev, - reverse->target->point.x, reverse->target->point.y, reverse->target->point.z, target->point.x, target->point.y, target->point.z); - } -#endif - }; + array = (T*)btAlignedAlloc(sizeof(T) * size, 16); + } - class Face + ~PoolArray() { - public: - Face* next; - Vertex* nearbyVertex; - Face* nextWithSameNearbyVertex; - Point32 origin; - Point32 dir0; - Point32 dir1; - - Face(): next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL) - { - } - - void init(Vertex* a, Vertex* b, Vertex* c) - { - nearbyVertex = a; - origin = a->point; - dir0 = *b - *a; - dir1 = *c - *a; - if (a->lastNearbyFace) - { - a->lastNearbyFace->nextWithSameNearbyVertex = this; - } - else - { - a->firstNearbyFace = this; - } - a->lastNearbyFace = this; - } - - Point64 getNormal() - { - return dir0.cross(dir1); - } - }; + btAlignedFree(array); + } - template class DMul + T* init() { - private: - static uint32_t high(uint64_t value) - { - return (uint32_t) (value >> 32); - } - - static uint32_t low(uint64_t value) - { - return (uint32_t) value; - } - - static uint64_t mul(uint32_t a, uint32_t b) - { - return (uint64_t) a * (uint64_t) b; - } - - static void shlHalf(uint64_t& value) - { - value <<= 32; - } - - static uint64_t high(Int128 value) - { - return value.high; - } - - static uint64_t low(Int128 value) - { - return value.low; - } - - static Int128 mul(uint64_t a, uint64_t b) - { - return Int128::mul(a, b); - } - - static void shlHalf(Int128& value) - { - value.high = value.low; - value.low = 0; - } - - public: - - static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh) - { - UWord p00 = mul(low(a), low(b)); - UWord p01 = mul(low(a), high(b)); - UWord p10 = mul(high(a), low(b)); - UWord p11 = mul(high(a), high(b)); - UWord p0110 = UWord(low(p01)) + UWord(low(p10)); - p11 += high(p01); - p11 += high(p10); - p11 += high(p0110); - shlHalf(p0110); - p00 += p0110; - if (p00 < p0110) - { - ++p11; - } - resLow = p00; - resHigh = p11; - } - }; - + T* o = array; + for (int i = 0; i < size; i++, o++) + { + o->next = (i + 1 < size) ? 
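// (The DMul template above builds a double-width product from four half-width partial
// products, schoolbook style. A hedged standalone sketch of the same scheme for a
// 64x64 -> 128 bit multiply in portable C++; the function name is illustrative:
//
//   #include <stdint.h>
//   static void mul64to128(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
//   {
//       uint64_t p00 = (a & 0xffffffffULL) * (b & 0xffffffffULL);  // low  * low
//       uint64_t p01 = (a & 0xffffffffULL) * (b >> 32);            // low  * high
//       uint64_t p10 = (a >> 32) * (b & 0xffffffffULL);            // high * low
//       uint64_t p11 = (a >> 32) * (b >> 32);                      // high * high
//       uint64_t mid = (p00 >> 32) + (p01 & 0xffffffffULL) + (p10 & 0xffffffffULL);
//       lo = (p00 & 0xffffffffULL) | (mid << 32);
//       hi = p11 + (p01 >> 32) + (p10 >> 32) + (mid >> 32);
//   }
// )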
o + 1 : NULL; + } + return array; + } + }; + + template + class Pool + { private: + PoolArray* arrays; + PoolArray* nextArray; + T* freeObjects; + int arraySize; - class IntermediateHull + public: + Pool() : arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256) { - public: - Vertex* minXy; - Vertex* maxXy; - Vertex* minYx; - Vertex* maxYx; - - IntermediateHull(): minXy(NULL), maxXy(NULL), minYx(NULL), maxYx(NULL) - { - } - - void print(); - }; - - enum Orientation {NONE, CLOCKWISE, COUNTER_CLOCKWISE}; + } - template class PoolArray + ~Pool() { - private: - T* array; - int size; + while (arrays) + { + PoolArray* p = arrays; + arrays = p->next; + p->~PoolArray(); + btAlignedFree(p); + } + } - public: - PoolArray* next; + void reset() + { + nextArray = arrays; + freeObjects = NULL; + } - PoolArray(int size): size(size), next(NULL) - { - array = (T*) btAlignedAlloc(sizeof(T) * size, 16); - } + void setArraySize(int arraySize) + { + this->arraySize = arraySize; + } - ~PoolArray() + T* newObject() + { + T* o = freeObjects; + if (!o) + { + PoolArray* p = nextArray; + if (p) { - btAlignedFree(array); + nextArray = p->next; } - - T* init() + else { - T* o = array; - for (int i = 0; i < size; i++, o++) - { - o->next = (i+1 < size) ? o + 1 : NULL; - } - return array; + p = new (btAlignedAlloc(sizeof(PoolArray), 16)) PoolArray(arraySize); + p->next = arrays; + arrays = p; } + o = p->init(); + } + freeObjects = o->next; + return new (o) T(); }; - template class Pool + void freeObject(T* object) { - private: - PoolArray* arrays; - PoolArray* nextArray; - T* freeObjects; - int arraySize; - - public: - Pool(): arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256) - { - } - - ~Pool() - { - while (arrays) - { - PoolArray* p = arrays; - arrays = p->next; - p->~PoolArray(); - btAlignedFree(p); - } - } + object->~T(); + object->next = freeObjects; + freeObjects = object; + } + }; - void reset() - { - nextArray = arrays; - freeObjects = NULL; - } + btVector3 scaling; + btVector3 center; + Pool vertexPool; + Pool edgePool; + Pool facePool; + btAlignedObjectArray originalVertices; + int mergeStamp; + int minAxis; + int medAxis; + int maxAxis; + int usedEdgePairs; + int maxUsedEdgePairs; - void setArraySize(int arraySize) - { - this->arraySize = arraySize; - } + static Orientation getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t); + Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot); + void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1); - T* newObject() - { - T* o = freeObjects; - if (!o) - { - PoolArray* p = nextArray; - if (p) - { - nextArray = p->next; - } - else - { - p = new(btAlignedAlloc(sizeof(PoolArray), 16)) PoolArray(arraySize); - p->next = arrays; - arrays = p; - } - o = p->init(); - } - freeObjects = o->next; - return new(o) T(); - }; + Edge* newEdgePair(Vertex* from, Vertex* to); - void freeObject(T* object) - { - object->~T(); - object->next = freeObjects; - freeObjects = object; - } - }; + void removeEdgePair(Edge* edge) + { + Edge* n = edge->next; + Edge* r = edge->reverse; - btVector3 scaling; - btVector3 center; - Pool vertexPool; - Pool edgePool; - Pool facePool; - btAlignedObjectArray originalVertices; - int mergeStamp; - int minAxis; - int medAxis; - int maxAxis; - int usedEdgePairs; - int maxUsedEdgePairs; + btAssert(edge->target && r->target); - static Orientation getOrientation(const Edge* prev, 
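// (The Pool template above carves objects out of chunked arrays threaded into a free
// list: newObject() pops the head, allocating a fresh PoolArray only when the list is
// empty, and freeObject() destroys the object and pushes its slot back. A hedged usage
// sketch; MyNode is an illustrative type, and the only requirement visible here is a
// public "next" pointer for the free-list linkage:
//
//   struct MyNode { MyNode* next; int value; MyNode() : next(0), value(0) {} };
//   Pool<MyNode> pool;
//   pool.setArraySize(64);         // chunk size used for the next allocation
//   MyNode* n = pool.newObject();  // placement-new on a recycled slot
//   pool.freeObject(n);            // runs ~MyNode() and recycles the slot
// )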
const Edge* next, const Point32& s, const Point32& t); - Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot); - void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1); + if (n != edge) + { + n->prev = edge->prev; + edge->prev->next = n; + r->target->edges = n; + } + else + { + r->target->edges = NULL; + } - Edge* newEdgePair(Vertex* from, Vertex* to); + n = r->next; - void removeEdgePair(Edge* edge) + if (n != r) + { + n->prev = r->prev; + r->prev->next = n; + edge->target->edges = n; + } + else { - Edge* n = edge->next; - Edge* r = edge->reverse; + edge->target->edges = NULL; + } - btAssert(edge->target && r->target); + edgePool.freeObject(edge); + edgePool.freeObject(r); + usedEdgePairs--; + } - if (n != edge) - { - n->prev = edge->prev; - edge->prev->next = n; - r->target->edges = n; - } - else - { - r->target->edges = NULL; - } - - n = r->next; - - if (n != r) - { - n->prev = r->prev; - r->prev->next = n; - edge->target->edges = n; - } - else - { - edge->target->edges = NULL; - } + void computeInternal(int start, int end, IntermediateHull& result); - edgePool.freeObject(edge); - edgePool.freeObject(r); - usedEdgePairs--; - } - - void computeInternal(int start, int end, IntermediateHull& result); - - bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1); - - void merge(IntermediateHull& h0, IntermediateHull& h1); + bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1); - btVector3 toBtVector(const Point32& v); + void merge(IntermediateHull& h0, IntermediateHull& h1); - btVector3 getBtNormal(Face* face); + btVector3 toBtVector(const Point32& v); - bool shiftFace(Face* face, btScalar amount, btAlignedObjectArray stack); + btVector3 getBtNormal(Face* face); - public: - Vertex* vertexList; + bool shiftFace(Face* face, btScalar amount, btAlignedObjectArray stack); - void compute(const void* coords, bool doubleCoords, int stride, int count); +public: + Vertex* vertexList; - btVector3 getCoordinates(const Vertex* v); + void compute(const void* coords, bool doubleCoords, int stride, int count); - btScalar shrink(btScalar amount, btScalar clampAmount); -}; + btVector3 getCoordinates(const Vertex* v); + btScalar shrink(btScalar amount, btScalar clampAmount); +}; btConvexHullInternal::Int128 btConvexHullInternal::Int128::operator*(int64_t b) const { - bool negative = (int64_t) high < 0; + bool negative = (int64_t)high < 0; Int128 a = negative ? -*this : *this; if (b < 0) { negative = !negative; b = -b; } - Int128 result = mul(a.low, (uint64_t) b); - result.high += a.high * (uint64_t) b; + Int128 result = mul(a.low, (uint64_t)b); + result.high += a.high * (uint64_t)b; return negative ? -result : result; } btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(int64_t a, int64_t b) { Int128 result; - + #ifdef USE_X86_64_ASM - __asm__ ("imulq %[b]" - : "=a" (result.low), "=d" (result.high) - : "0"(a), [b] "r"(b) - : "cc" ); + __asm__("imulq %[b]" + : "=a"(result.low), "=d"(result.high) + : "0"(a), [b] "r"(b) + : "cc"); return result; - + #else bool negative = a < 0; if (negative) @@ -871,7 +872,7 @@ btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(int64_t a, int64_ negative = !negative; b = -b; } - DMul::mul((uint64_t) a, (uint64_t) b, result.low, result.high); + DMul::mul((uint64_t)a, (uint64_t)b, result.low, result.high); return negative ? 
-result : result; #endif } @@ -881,10 +882,10 @@ btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(uint64_t a, uint6 Int128 result; #ifdef USE_X86_64_ASM - __asm__ ("mulq %[b]" - : "=a" (result.low), "=d" (result.high) - : "0"(a), [b] "r"(b) - : "cc" ); + __asm__("mulq %[b]" + : "=a"(result.low), "=d"(result.high) + : "0"(a), [b] "r"(b) + : "cc"); #else DMul::mul(a, b, result.low, result.high); @@ -911,24 +912,25 @@ int btConvexHullInternal::Rational64::compare(const Rational64& b) const int result; int64_t tmp; int64_t dummy; - __asm__ ("mulq %[bn]\n\t" - "movq %%rax, %[tmp]\n\t" - "movq %%rdx, %%rbx\n\t" - "movq %[tn], %%rax\n\t" - "mulq %[bd]\n\t" - "subq %[tmp], %%rax\n\t" - "sbbq %%rbx, %%rdx\n\t" // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator" - "setnsb %%bh\n\t" // bh=1 if difference is non-negative, bh=0 otherwise - "orq %%rdx, %%rax\n\t" - "setnzb %%bl\n\t" // bl=1 if difference if non-zero, bl=0 if it is zero - "decb %%bh\n\t" // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference) - "shll $16, %%ebx\n\t" // ebx has same sign as difference - : "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy) - : "a"(denominator), [bn] "g"(b.numerator), [tn] "g"(numerator), [bd] "g"(b.denominator) - : "%rdx", "cc" ); - return result ? result ^ sign // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero) - // if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero) - : 0; + __asm__( + "mulq %[bn]\n\t" + "movq %%rax, %[tmp]\n\t" + "movq %%rdx, %%rbx\n\t" + "movq %[tn], %%rax\n\t" + "mulq %[bd]\n\t" + "subq %[tmp], %%rax\n\t" + "sbbq %%rbx, %%rdx\n\t" // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator" + "setnsb %%bh\n\t" // bh=1 if difference is non-negative, bh=0 otherwise + "orq %%rdx, %%rax\n\t" + "setnzb %%bl\n\t" // bl=1 if difference if non-zero, bl=0 if it is zero + "decb %%bh\n\t" // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference) + "shll $16, %%ebx\n\t" // ebx has same sign as difference + : "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy) + : "a"(m_denominator), [bn] "g"(b.m_numerator), [tn] "g"(m_numerator), [bd] "g"(b.m_denominator) + : "%rdx", "cc"); + return result ? result ^ sign // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero) + // if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero) + : 0; #else @@ -949,7 +951,7 @@ int btConvexHullInternal::Rational128::compare(const Rational128& b) const } if (isInt64) { - return -b.compare(sign * (int64_t) numerator.low); + return -b.compare(sign * (int64_t)numerator.low); } Int128 nbdLow, nbdHigh, dbnLow, dbnHigh; @@ -968,7 +970,7 @@ int btConvexHullInternal::Rational128::compare(int64_t b) const { if (isInt64) { - int64_t a = sign * (int64_t) numerator.low; + int64_t a = sign * (int64_t)numerator.low; return (a > b) ? 1 : (a < b) ? 
-1 : 0; } if (b > 0) @@ -994,7 +996,6 @@ int btConvexHullInternal::Rational128::compare(int64_t b) const return numerator.ucmp(denominator * b) * sign; } - btConvexHullInternal::Edge* btConvexHullInternal::newEdgePair(Vertex* from, Vertex* to) { btAssert(from && to); @@ -1062,7 +1063,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul } } } - + v0 = h0.maxXy; v1 = h1.maxXy; Vertex* v00 = NULL; @@ -1070,7 +1071,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul int32_t sign = 1; for (int side = 0; side <= 1; side++) - { + { int32_t dx = (v1->point.x - v0->point.x) * sign; if (dx > 0) { @@ -1113,7 +1114,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul while (true) { int32_t dy = v1->point.y - v0->point.y; - + Vertex* w1 = side ? v1->prev : v1->next; if (w1 != v1) { @@ -1126,7 +1127,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul continue; } } - + Vertex* w0 = side ? v0->prev : v0->next; if (w0 != v0) { @@ -1140,7 +1141,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul continue; } } - + break; } } @@ -1166,7 +1167,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul } v1 = w1; } - + if (side == 0) { v00 = v0; @@ -1192,7 +1193,7 @@ bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHul { h0.maxXy = h1.maxXy; } - + h0.maxYx = h1.maxYx; c0 = v00; @@ -1279,19 +1280,19 @@ void btConvexHullInternal::computeInternal(int start, int end, IntermediateHull& } { Vertex* v = originalVertices[start]; - v->edges = NULL; - v->next = v; - v->prev = v; - - result.minXy = v; - result.maxXy = v; - result.minYx = v; - result.maxYx = v; + v->edges = NULL; + v->next = v; + v->prev = v; + + result.minXy = v; + result.maxXy = v; + result.minYx = v; + result.maxYx = v; } - + return; } - + case 1: { Vertex* v = originalVertices[start]; @@ -1309,7 +1310,7 @@ void btConvexHullInternal::computeInternal(int start, int end, IntermediateHull& } int split0 = start + n / 2; - Point32 p = originalVertices[split0-1]->point; + Point32 p = originalVertices[split0 - 1]->point; int split1 = split0; while ((split1 < end) && (originalVertices[split1]->point == p)) { @@ -1334,7 +1335,7 @@ void btConvexHullInternal::computeInternal(int start, int end, IntermediateHull& void btConvexHullInternal::IntermediateHull::print() { printf(" Hull\n"); - for (Vertex* v = minXy; v; ) + for (Vertex* v = minXy; v;) { printf(" "); v->print(); @@ -1362,7 +1363,7 @@ void btConvexHullInternal::IntermediateHull::print() } } if (minXy) - { + { minXy->copy = (minXy->copy == -1) ? -2 : -1; minXy->printGraph(); } @@ -1438,7 +1439,7 @@ btConvexHullInternal::Edge* btConvexHullInternal::findMaxAngle(bool ccw, const V Point32 t = *e->target - *start; Rational64 cot(t.dot(sxrxs), t.dot(rxs)); #ifdef DEBUG_CONVEX_HULL - printf(" Angle is %f (%d) for ", (float) btAtan(cot.toScalar()), (int) cot.isNaN()); + printf(" Angle is %f (%d) for ", (float)btAtan(cot.toScalar()), (int)cot.isNaN()); e->print(); #endif if (cot.isNaN()) @@ -1485,7 +1486,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge btAssert(!start1 || (start1->target->point.dot(normal) == dist)); Point64 perp = s.cross(normal); btAssert(!perp.isZero()); - + #ifdef DEBUG_CONVEX_HULL printf(" Advancing %d %d (%p %p, %d %d)\n", c0->point.index, c1->point.index, start0, start1, start0 ? start0->target->point.index : -1, start1 ? 
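// (Rational64::compare above decides a/b versus c/d from the sign of the 128-bit
// difference a*d - c*b, so no precision is lost to floating point. A portable sketch
// of the same idea, assuming the GCC/Clang __int128 extension and positive
// denominators; illustrative only, not the Bullet code path:
//
//   static int compareRationals(int64_t a, int64_t b, int64_t c, int64_t d)
//   {
//       __int128 diff = (__int128)a * d - (__int128)c * b;  // exact, cannot overflow
//       return (diff > 0) - (diff < 0);                     // -1, 0 or +1
//   }
// )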
start1->target->point.index : -1); #endif @@ -1515,7 +1516,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge et0 = e->target->point; } } - + int64_t maxDot1 = et1.dot(perp); if (e1) { @@ -1552,7 +1553,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge while (true) { int64_t dy = (et1 - et0).dot(s); - + if (e0 && (e0->target != stop0)) { Edge* f0 = e0->next->reverse; @@ -1569,7 +1570,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge } } } - + if (e1 && (e1->target != stop1)) { Edge* f1 = e1->reverse->next; @@ -1604,7 +1605,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge while (true) { int64_t dy = (et1 - et0).dot(s); - + if (e1 && (e1->target != stop1)) { Edge* f1 = e1->prev->reverse; @@ -1621,7 +1622,7 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge } } } - + if (e0 && (e0->target != stop0)) { Edge* f0 = e0->reverse->prev; @@ -1656,7 +1657,6 @@ void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge #endif } - void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) { if (!h1.maxXy) @@ -1668,7 +1668,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) h0 = h1; return; } - + mergeStamp--; Vertex* c0 = NULL; @@ -1708,7 +1708,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) e = e->next; } while (e != c0->edges); } - + e = c1->edges; Edge* start1 = NULL; if (e) @@ -1760,7 +1760,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) Point32 r = prevPoint - c0->point; Point64 rxs = r.cross(s); Point64 sxrxs = s.cross(rxs); - + #ifdef DEBUG_CONVEX_HULL printf("\n Checking %d %d\n", c0->point.index, c1->point.index); #endif @@ -1811,7 +1811,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) e->prev = pendingTail1; pendingTail1 = e; } - + Edge* e0 = min0; Edge* e1 = min1; @@ -1828,7 +1828,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) { if (toPrev1) { - for (Edge* e = toPrev1->next, *n = NULL; e != min1; e = n) + for (Edge *e = toPrev1->next, *n = NULL; e != min1; e = n) { n = e->next; removeEdgePair(e); @@ -1864,7 +1864,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) { if (toPrev0) { - for (Edge* e = toPrev0->prev, *n = NULL; e != min0; e = n) + for (Edge *e = toPrev0->prev, *n = NULL; e != min0; e = n) { n = e->prev; removeEdgePair(e); @@ -1906,7 +1906,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) } else { - for (Edge* e = toPrev0->prev, *n = NULL; e != firstNew0; e = n) + for (Edge *e = toPrev0->prev, *n = NULL; e != firstNew0; e = n) { n = e->prev; removeEdgePair(e); @@ -1925,7 +1925,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) } else { - for (Edge* e = toPrev1->next, *n = NULL; e != firstNew1; e = n) + for (Edge *e = toPrev1->next, *n = NULL; e != firstNew1; e = n) { n = e->next; removeEdgePair(e); @@ -1936,7 +1936,7 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) pendingTail1->link(firstNew1); } } - + return; } @@ -1946,24 +1946,23 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1) class pointCmp { - public: - - bool operator() ( const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q ) const - { - return (p.y < q.y) || ((p.y == 
q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z)))); - } +public: + bool operator()(const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q) const + { + return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z)))); + } }; void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count) { btVector3 min(btScalar(1e30), btScalar(1e30), btScalar(1e30)), max(btScalar(-1e30), btScalar(-1e30), btScalar(-1e30)); - const char* ptr = (const char*) coords; + const char* ptr = (const char*)coords; if (doubleCoords) { for (int i = 0; i < count; i++) { - const double* v = (const double*) ptr; - btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]); + const double* v = (const double*)ptr; + btVector3 p((btScalar)v[0], (btScalar)v[1], (btScalar)v[2]); ptr += stride; min.setMin(p); max.setMax(p); @@ -1973,7 +1972,7 @@ void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int st { for (int i = 0; i < count; i++) { - const float* v = (const float*) ptr; + const float* v = (const float*)ptr; btVector3 p(v[0], v[1], v[2]); ptr += stride; min.setMin(p); @@ -2014,18 +2013,18 @@ void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int st btAlignedObjectArray points; points.resize(count); - ptr = (const char*) coords; + ptr = (const char*)coords; if (doubleCoords) { for (int i = 0; i < count; i++) { - const double* v = (const double*) ptr; - btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]); + const double* v = (const double*)ptr; + btVector3 p((btScalar)v[0], (btScalar)v[1], (btScalar)v[2]); ptr += stride; p = (p - center) * s; - points[i].x = (int32_t) p[medAxis]; - points[i].y = (int32_t) p[maxAxis]; - points[i].z = (int32_t) p[minAxis]; + points[i].x = (int32_t)p[medAxis]; + points[i].y = (int32_t)p[maxAxis]; + points[i].z = (int32_t)p[minAxis]; points[i].index = i; } } @@ -2033,13 +2032,13 @@ void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int st { for (int i = 0; i < count; i++) { - const float* v = (const float*) ptr; + const float* v = (const float*)ptr; btVector3 p(v[0], v[1], v[2]); ptr += stride; p = (p - center) * s; - points[i].x = (int32_t) p[medAxis]; - points[i].y = (int32_t) p[maxAxis]; - points[i].z = (int32_t) p[minAxis]; + points[i].x = (int32_t)p[medAxis]; + points[i].y = (int32_t)p[maxAxis]; + points[i].z = (int32_t)p[minAxis]; points[i].index = i; } } @@ -2193,7 +2192,7 @@ btScalar btConvexHullInternal::shrink(btScalar amount, btScalar clampAmount) minDist = dist; } } - + if (minDist <= 0) { return 0; @@ -2234,7 +2233,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec { origShift[2] /= scaling[2]; } - Point32 shift((int32_t) origShift[medAxis], (int32_t) origShift[maxAxis], (int32_t) origShift[minAxis]); + Point32 shift((int32_t)origShift[medAxis], (int32_t)origShift[maxAxis], (int32_t)origShift[minAxis]); if (shift.isZero()) { return true; @@ -2242,7 +2241,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec Point64 normal = face->getNormal(); #ifdef DEBUG_CONVEX_HULL printf("\nShrinking face (%d %d %d) (%d %d %d) (%d %d %d) by (%d %d %d)\n", - face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, shift.z); + face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, 
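// (A note on ordering: pointCmp above compares the quantized points lexicographically
// by y, then x, then z, which makes duplicate points adjacent and lets computeInternal
// split sorted ranges cleanly. A standalone sketch of the same comparator; the P struct
// and the std::sort call are illustrative, not how Bullet invokes it:
//
//   #include <algorithm>
//   #include <stdint.h>
//   struct P { int32_t x, y, z; };
//   static bool lessYXZ(const P& p, const P& q)
//   {
//       return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
//   }
//   // std::sort(pts, pts + n, lessYXZ);
// )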
shift.z); #endif int64_t origDot = face->origin.dot(normal); Point32 shiftedOrigin = face->origin + shift; @@ -2279,7 +2278,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec #ifdef DEBUG_CONVEX_HULL printf("Moving downwards, edge is "); e->print(); - printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot); + printf(", dot is %f (%f %lld)\n", (float)dot.toScalar(), (float)optDot.toScalar(), shiftedDot); #endif if (dot.compare(optDot) < 0) { @@ -2315,7 +2314,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec #ifdef DEBUG_CONVEX_HULL printf("Moving upwards, edge is "); e->print(); - printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot); + printf(", dot is %f (%f %lld)\n", (float)dot.toScalar(), (float)optDot.toScalar(), shiftedDot); #endif if (dot.compare(optDot) > 0) { @@ -2331,7 +2330,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec } e = e->prev; } while (e != startEdge); - + if (!intersection) { return true; @@ -2368,7 +2367,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec printf("Needed %d iterations to check for complete containment\n", n); #endif } - + Edge* firstIntersection = NULL; Edge* faceEdge = NULL; Edge* firstFaceEdge = NULL; @@ -2477,7 +2476,7 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec #ifdef DEBUG_CONVEX_HULL printf("1: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z); #endif - + Point64 n0 = intersection->face->getNormal(); Point64 n1 = intersection->reverse->face->getNormal(); int64_t m00 = face->dir0.dot(n0); @@ -2491,16 +2490,13 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec Vertex* v = vertexPool.newObject(); v->point.index = -1; v->copy = -1; - v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01) - + Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x, - Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01) - + Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y, - Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01) - + Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z, - det); - v->point.x = (int32_t) v->point128.xvalue(); - v->point.y = (int32_t) v->point128.yvalue(); - v->point.z = (int32_t) v->point128.zvalue(); + v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01) + Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x, + Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01) + Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y, + Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01) + Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z, + det); + v->point.x = (int32_t)v->point128.xvalue(); + v->point.y = (int32_t)v->point128.yvalue(); + v->point.z = (int32_t)v->point128.zvalue(); intersection->target = v; v->edges = e; @@ -2639,7 +2635,6 @@ bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjec return true; } - static int 
getVertexCopy(btConvexHullInternal::Vertex* vertex, btAlignedObjectArray& vertices) { int index = vertex->copy; @@ -2761,8 +2756,3 @@ btScalar btConvexHullComputer::compute(const void* coords, bool doubleCoords, in return shift; } - - - - - diff --git a/thirdparty/bullet/LinearMath/btConvexHullComputer.h b/thirdparty/bullet/LinearMath/btConvexHullComputer.h index 7240ac4fb5..cba684f2dc 100644 --- a/thirdparty/bullet/LinearMath/btConvexHullComputer.h +++ b/thirdparty/bullet/LinearMath/btConvexHullComputer.h @@ -23,58 +23,56 @@ subject to the following restrictions: /// Ole Kniemeyer, MAXON Computer GmbH class btConvexHullComputer { +private: + btScalar compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp); + +public: + class Edge + { private: - btScalar compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp); + int next; + int reverse; + int targetVertex; - public: + friend class btConvexHullComputer; - class Edge + public: + int getSourceVertex() const { - private: - int next; - int reverse; - int targetVertex; - - friend class btConvexHullComputer; - - public: - int getSourceVertex() const - { - return (this + reverse)->targetVertex; - } - - int getTargetVertex() const - { - return targetVertex; - } + return (this + reverse)->targetVertex; + } - const Edge* getNextEdgeOfVertex() const // clockwise list of all edges of a vertex - { - return this + next; - } + int getTargetVertex() const + { + return targetVertex; + } - const Edge* getNextEdgeOfFace() const // counter-clockwise list of all edges of a face - { - return (this + reverse)->getNextEdgeOfVertex(); - } + const Edge* getNextEdgeOfVertex() const // clockwise list of all edges of a vertex + { + return this + next; + } - const Edge* getReverseEdge() const - { - return this + reverse; - } - }; + const Edge* getNextEdgeOfFace() const // counter-clockwise list of all edges of a face + { + return (this + reverse)->getNextEdgeOfVertex(); + } + const Edge* getReverseEdge() const + { + return this + reverse; + } + }; - // Vertices of the output hull - btAlignedObjectArray vertices; + // Vertices of the output hull + btAlignedObjectArray vertices; - // Edges of the output hull - btAlignedObjectArray edges; + // Edges of the output hull + btAlignedObjectArray edges; - // Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons - btAlignedObjectArray faces; + // Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons + btAlignedObjectArray faces; - /* + /* Compute convex hull of "count" vertices stored in "coords". "stride" is the difference in bytes between the addresses of consecutive vertices. If "shrink" is positive, the convex hull is shrunken by that amount (each face is moved by "shrink" length units towards the center along its normal). @@ -86,18 +84,16 @@ class btConvexHullComputer The output convex hull can be found in the member variables "vertices", "edges", "faces". 
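		For example, a hedged sketch (the cube array and variable names are
		illustrative):

			float cube[8 * 3] = {
				-1, -1, -1,   1, -1, -1,   1,  1, -1,  -1,  1, -1,
				-1, -1,  1,   1, -1,  1,   1,  1,  1,  -1,  1,  1};
			btConvexHullComputer hull;
			hull.compute(cube, 3 * sizeof(float), 8, 0.f, 0.f);  // stride in bytes, no shrinking
			// hull.vertices, hull.edges and hull.faces now describe the cube's hull.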
*/ - btScalar compute(const float* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp) - { - return compute(coords, false, stride, count, shrink, shrinkClamp); - } - - // same as above, but double precision - btScalar compute(const double* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp) - { - return compute(coords, true, stride, count, shrink, shrinkClamp); - } + btScalar compute(const float* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp) + { + return compute(coords, false, stride, count, shrink, shrinkClamp); + } + + // same as above, but double precision + btScalar compute(const double* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp) + { + return compute(coords, true, stride, count, shrink, shrinkClamp); + } }; - -#endif //BT_CONVEX_HULL_COMPUTER_H - +#endif //BT_CONVEX_HULL_COMPUTER_H diff --git a/thirdparty/bullet/LinearMath/btCpuFeatureUtility.h b/thirdparty/bullet/LinearMath/btCpuFeatureUtility.h index d2cab52d48..5e4b9a313c 100644 --- a/thirdparty/bullet/LinearMath/btCpuFeatureUtility.h +++ b/thirdparty/bullet/LinearMath/btCpuFeatureUtility.h @@ -4,20 +4,20 @@ #include "LinearMath/btScalar.h" -#include //memset -#ifdef USE_SIMD +#include //memset +#ifdef USE_SIMD #include #ifdef BT_ALLOW_SSE4 #include -#endif //BT_ALLOW_SSE4 -#endif //USE_SIMD +#endif //BT_ALLOW_SSE4 +#endif //USE_SIMD #if defined BT_USE_NEON -#define ARM_NEON_GCC_COMPATIBILITY 1 +#define ARM_NEON_GCC_COMPATIBILITY 1 #include #include -#include //for sysctlbyname -#endif //BT_USE_NEON +#include //for sysctlbyname +#endif //BT_USE_NEON ///Rudimentary btCpuFeatureUtility for CPU features: only report the features that Bullet actually uses (SSE4/FMA3, NEON_HPFP) ///We assume SSE2 in case BT_USE_SSE2 is defined in LinearMath/btScalar.h @@ -26,14 +26,13 @@ class btCpuFeatureUtility public: enum btCpuFeature { - CPU_FEATURE_FMA3=1, - CPU_FEATURE_SSE4_1=2, - CPU_FEATURE_NEON_HPFP=4 + CPU_FEATURE_FMA3 = 1, + CPU_FEATURE_SSE4_1 = 2, + CPU_FEATURE_NEON_HPFP = 4 }; static int getCpuFeatures() { - static int capabilities = 0; static bool testedCapabilities = false; if (0 != testedCapabilities) @@ -49,15 +48,15 @@ public: if (0 == err && hasFeature) capabilities |= CPU_FEATURE_NEON_HPFP; } -#endif //BT_USE_NEON +#endif //BT_USE_NEON -#ifdef BT_ALLOW_SSE4 +#ifdef BT_ALLOW_SSE4 { - int cpuInfo[4]; + int cpuInfo[4]; memset(cpuInfo, 0, sizeof(cpuInfo)); - unsigned long long sseExt = 0; + unsigned long long sseExt = 0; __cpuid(cpuInfo, 1); - + bool osUsesXSAVE_XRSTORE = cpuInfo[2] & (1 << 27) || false; bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; @@ -79,14 +78,11 @@ public: capabilities |= btCpuFeatureUtility::CPU_FEATURE_SSE4_1; } } -#endif//BT_ALLOW_SSE4 +#endif //BT_ALLOW_SSE4 testedCapabilities = true; return capabilities; } - - }; - -#endif //BT_CPU_UTILITY_H +#endif //BT_CPU_UTILITY_H diff --git a/thirdparty/bullet/LinearMath/btDefaultMotionState.h b/thirdparty/bullet/LinearMath/btDefaultMotionState.h index 01c5f8d932..14c40d36b0 100644 --- a/thirdparty/bullet/LinearMath/btDefaultMotionState.h +++ b/thirdparty/bullet/LinearMath/btDefaultMotionState.h @@ -4,39 +4,37 @@ #include "btMotionState.h" ///The btDefaultMotionState provides a common implementation to synchronize world transforms with offsets. 
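///
///A minimal usage sketch (hedged; the start transform and the rigid-body hookup are
///illustrative, not mandated by this header):
///
///  btTransform startTransform;
///  startTransform.setIdentity();
///  startTransform.setOrigin(btVector3(0, 10, 0));
///  btDefaultMotionState* motionState = new btDefaultMotionState(startTransform);
///  // hand motionState to a btRigidBody; Bullet then calls setWorldTransform() on it
///  // whenever the active body moves, and getWorldTransform() to read the start pose.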
-ATTRIBUTE_ALIGNED16(struct) btDefaultMotionState : public btMotionState +ATTRIBUTE_ALIGNED16(struct) +btDefaultMotionState : public btMotionState { btTransform m_graphicsWorldTrans; - btTransform m_centerOfMassOffset; + btTransform m_centerOfMassOffset; btTransform m_startWorldTrans; - void* m_userPointer; + void* m_userPointer; BT_DECLARE_ALIGNED_ALLOCATOR(); - btDefaultMotionState(const btTransform& startTrans = btTransform::getIdentity(),const btTransform& centerOfMassOffset = btTransform::getIdentity()) + btDefaultMotionState(const btTransform& startTrans = btTransform::getIdentity(), const btTransform& centerOfMassOffset = btTransform::getIdentity()) : m_graphicsWorldTrans(startTrans), - m_centerOfMassOffset(centerOfMassOffset), - m_startWorldTrans(startTrans), - m_userPointer(0) + m_centerOfMassOffset(centerOfMassOffset), + m_startWorldTrans(startTrans), + m_userPointer(0) { } ///synchronizes world transform from user to physics - virtual void getWorldTransform(btTransform& centerOfMassWorldTrans ) const + virtual void getWorldTransform(btTransform & centerOfMassWorldTrans) const { - centerOfMassWorldTrans = m_graphicsWorldTrans * m_centerOfMassOffset.inverse() ; + centerOfMassWorldTrans = m_graphicsWorldTrans * m_centerOfMassOffset.inverse(); } ///synchronizes world transform from physics to user ///Bullet only calls the update of worldtransform for active objects - virtual void setWorldTransform(const btTransform& centerOfMassWorldTrans) + virtual void setWorldTransform(const btTransform& centerOfMassWorldTrans) { - m_graphicsWorldTrans = centerOfMassWorldTrans * m_centerOfMassOffset; + m_graphicsWorldTrans = centerOfMassWorldTrans * m_centerOfMassOffset; } - - - }; -#endif //BT_DEFAULT_MOTION_STATE_H +#endif //BT_DEFAULT_MOTION_STATE_H diff --git a/thirdparty/bullet/LinearMath/btGeometryUtil.cpp b/thirdparty/bullet/LinearMath/btGeometryUtil.cpp index 5ac230f712..115e3eab81 100644 --- a/thirdparty/bullet/LinearMath/btGeometryUtil.cpp +++ b/thirdparty/bullet/LinearMath/btGeometryUtil.cpp @@ -12,49 +12,43 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #include "btGeometryUtil.h" - /* Make sure this dummy function never changes so that it can be used by probes that are checking whether the library is actually installed. 
*/ extern "C" -{ - void btBulletMathProbe (); +{ + void btBulletMathProbe(); - void btBulletMathProbe () {} + void btBulletMathProbe() {} } - -bool btGeometryUtil::isPointInsidePlanes(const btAlignedObjectArray& planeEquations, const btVector3& point, btScalar margin) +bool btGeometryUtil::isPointInsidePlanes(const btAlignedObjectArray& planeEquations, const btVector3& point, btScalar margin) { int numbrushes = planeEquations.size(); - for (int i=0;ibtScalar(0.)) + btScalar dist = btScalar(N1.dot(point)) + btScalar(N1[3]) - margin; + if (dist > btScalar(0.)) { return false; } } return true; - } - -bool btGeometryUtil::areVerticesBehindPlane(const btVector3& planeNormal, const btAlignedObjectArray& vertices, btScalar margin) +bool btGeometryUtil::areVerticesBehindPlane(const btVector3& planeNormal, const btAlignedObjectArray& vertices, btScalar margin) { int numvertices = vertices.size(); - for (int i=0;ibtScalar(0.)) + btScalar dist = btScalar(planeNormal.dot(N1)) + btScalar(planeNormal[3]) - margin; + if (dist > btScalar(0.)) { return false; } @@ -62,102 +56,98 @@ bool btGeometryUtil::areVerticesBehindPlane(const btVector3& planeNormal, const return true; } -bool notExist(const btVector3& planeEquation,const btAlignedObjectArray& planeEquations); +bool notExist(const btVector3& planeEquation, const btAlignedObjectArray& planeEquations); -bool notExist(const btVector3& planeEquation,const btAlignedObjectArray& planeEquations) +bool notExist(const btVector3& planeEquation, const btAlignedObjectArray& planeEquations) { int numbrushes = planeEquations.size(); - for (int i=0;i btScalar(0.999)) { return false; - } + } } return true; } -void btGeometryUtil::getPlaneEquationsFromVertices(btAlignedObjectArray& vertices, btAlignedObjectArray& planeEquationsOut ) +void btGeometryUtil::getPlaneEquationsFromVertices(btAlignedObjectArray& vertices, btAlignedObjectArray& planeEquationsOut) { - const int numvertices = vertices.size(); + const int numvertices = vertices.size(); // brute force: - for (int i=0;i btScalar(0.0001)) { planeEquation.normalize(); - if (notExist(planeEquation,planeEquationsOut)) + if (notExist(planeEquation, planeEquationsOut)) { planeEquation[3] = -planeEquation.dot(N1); - - //check if inside, and replace supportingVertexOut if needed - if (areVerticesBehindPlane(planeEquation,vertices,btScalar(0.01))) - { - planeEquationsOut.push_back(planeEquation); - } + + //check if inside, and replace supportingVertexOut if needed + if (areVerticesBehindPlane(planeEquation, vertices, btScalar(0.01))) + { + planeEquationsOut.push_back(planeEquation); + } } } normalSign = btScalar(-1.); } - } } } - } -void btGeometryUtil::getVerticesFromPlaneEquations(const btAlignedObjectArray& planeEquations , btAlignedObjectArray& verticesOut ) +void btGeometryUtil::getVerticesFromPlaneEquations(const btAlignedObjectArray& planeEquations, btAlignedObjectArray& verticesOut) { const int numbrushes = planeEquations.size(); // brute force: - for (int i=0;i btScalar(0.0001) ) && - ( n3n1.length2() > btScalar(0.0001) ) && - ( n1n2.length2() > btScalar(0.0001) ) ) + btVector3 n2n3; + n2n3 = N2.cross(N3); + btVector3 n3n1; + n3n1 = N3.cross(N1); + btVector3 n1n2; + n1n2 = N1.cross(N2); + + if ((n2n3.length2() > btScalar(0.0001)) && + (n3n1.length2() > btScalar(0.0001)) && + (n1n2.length2() > btScalar(0.0001))) { //point P out of 3 plane equations: - // d1 ( N2 * N3 ) + d2 ( N3 * N1 ) + d3 ( N1 * N2 ) - //P = ------------------------------------------------------------------------- - // N1 . 
( N2 * N3 ) - + // d1 ( N2 * N3 ) + d2 ( N3 * N1 ) + d3 ( N1 * N2 ) + //P = ------------------------------------------------------------------------- + // N1 . ( N2 * N3 ) btScalar quotient = (N1.dot(n2n3)); if (btFabs(quotient) > btScalar(0.000001)) @@ -172,7 +162,7 @@ void btGeometryUtil::getVerticesFromPlaneEquations(const btAlignedObjectArray& vertices, btAlignedObjectArray& planeEquationsOut ); - - static void getVerticesFromPlaneEquations(const btAlignedObjectArray& planeEquations , btAlignedObjectArray& verticesOut ); - - static bool isInside(const btAlignedObjectArray& vertices, const btVector3& planeNormal, btScalar margin); - - static bool isPointInsidePlanes(const btAlignedObjectArray& planeEquations, const btVector3& point, btScalar margin); +public: + static void getPlaneEquationsFromVertices(btAlignedObjectArray& vertices, btAlignedObjectArray& planeEquationsOut); - static bool areVerticesBehindPlane(const btVector3& planeNormal, const btAlignedObjectArray& vertices, btScalar margin); + static void getVerticesFromPlaneEquations(const btAlignedObjectArray& planeEquations, btAlignedObjectArray& verticesOut); -}; + static bool isInside(const btAlignedObjectArray& vertices, const btVector3& planeNormal, btScalar margin); + static bool isPointInsidePlanes(const btAlignedObjectArray& planeEquations, const btVector3& point, btScalar margin); -#endif //BT_GEOMETRY_UTIL_H + static bool areVerticesBehindPlane(const btVector3& planeNormal, const btAlignedObjectArray& vertices, btScalar margin); +}; +#endif //BT_GEOMETRY_UTIL_H diff --git a/thirdparty/bullet/LinearMath/btGrahamScan2dConvexHull.h b/thirdparty/bullet/LinearMath/btGrahamScan2dConvexHull.h index 13a79aa585..0fcb285971 100644 --- a/thirdparty/bullet/LinearMath/btGrahamScan2dConvexHull.h +++ b/thirdparty/bullet/LinearMath/btGrahamScan2dConvexHull.h @@ -13,41 +13,40 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - #ifndef GRAHAM_SCAN_2D_CONVEX_HULL_H #define GRAHAM_SCAN_2D_CONVEX_HULL_H - #include "btVector3.h" #include "btAlignedObjectArray.h" struct GrahamVector3 : public btVector3 { GrahamVector3(const btVector3& org, int orgIndex) - :btVector3(org), - m_orgIndex(orgIndex) + : btVector3(org), + m_orgIndex(orgIndex) { } - btScalar m_angle; + btScalar m_angle; int m_orgIndex; }; - -struct btAngleCompareFunc { +struct btAngleCompareFunc +{ btVector3 m_anchor; btAngleCompareFunc(const btVector3& anchor) - : m_anchor(anchor) + : m_anchor(anchor) { } - bool operator()(const GrahamVector3& a, const GrahamVector3& b) const { + bool operator()(const GrahamVector3& a, const GrahamVector3& b) const + { if (a.m_angle != b.m_angle) return a.m_angle < b.m_angle; else { - btScalar al = (a-m_anchor).length2(); - btScalar bl = (b-m_anchor).length2(); + btScalar al = (a - m_anchor).length2(); + btScalar bl = (b - m_anchor).length2(); if (al != bl) - return al < bl; + return al < bl; else { return a.m_orgIndex < b.m_orgIndex; @@ -58,73 +57,73 @@ struct btAngleCompareFunc { inline void GrahamScanConvexHull2D(btAlignedObjectArray& originalPoints, btAlignedObjectArray& hull, const btVector3& normalAxis) { - btVector3 axis0,axis1; - btPlaneSpace1(normalAxis,axis0,axis1); - + btVector3 axis0, axis1; + btPlaneSpace1(normalAxis, axis0, axis1); - if (originalPoints.size()<=1) + if (originalPoints.size() <= 1) { - for (int i=0;i1) { - btVector3& a = hull[hull.size()-2]; - btVector3& b = hull[hull.size()-1]; - isConvex = btCross(a-b,a-originalPoints[i]).dot(normalAxis)> 0; + while (!isConvex && hull.size() > 1) + { + btVector3& a = hull[hull.size() - 2]; + btVector3& b = hull[hull.size() - 1]; + isConvex = btCross(a - b, a - originalPoints[i]).dot(normalAxis) > 0; if (!isConvex) hull.pop_back(); - else + else hull.push_back(originalPoints[i]); } - if( hull.size() == 1 ) - { - hull.push_back( originalPoints[i] ); - } + if (hull.size() == 1) + { + hull.push_back(originalPoints[i]); + } } } -#endif //GRAHAM_SCAN_2D_CONVEX_HULL_H +#endif //GRAHAM_SCAN_2D_CONVEX_HULL_H diff --git a/thirdparty/bullet/LinearMath/btHashMap.h b/thirdparty/bullet/LinearMath/btHashMap.h index 180e7b44af..1fca0fb73a 100644 --- a/thirdparty/bullet/LinearMath/btHashMap.h +++ b/thirdparty/bullet/LinearMath/btHashMap.h @@ -13,7 +13,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - #ifndef BT_HASH_MAP_H #define BT_HASH_MAP_H @@ -24,32 +23,32 @@ subject to the following restrictions: struct btHashString { std::string m_string1; - unsigned int m_hash; + unsigned int m_hash; - SIMD_FORCE_INLINE unsigned int getHash()const + SIMD_FORCE_INLINE unsigned int getHash() const { return m_hash; } btHashString() { - m_string1=""; - m_hash=0; + m_string1 = ""; + m_hash = 0; } btHashString(const char* name) - :m_string1(name) + : m_string1(name) { /* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */ - static const unsigned int InitialFNV = 2166136261u; + static const unsigned int InitialFNV = 2166136261u; static const unsigned int FNVMultiple = 16777619u; /* Fowler / Noll / Vo (FNV) Hash */ unsigned int hash = InitialFNV; - - for(int i = 0; m_string1.c_str()[i]; i++) + + for (int i = 0; m_string1.c_str()[i]; i++) { - hash = hash ^ (m_string1.c_str()[i]); /* xor the low 8 bits */ - hash = hash * FNVMultiple; /* multiply by the magic number */ + hash = hash ^ (m_string1.c_str()[i]); /* xor the low 8 bits */ + hash = hash * FNVMultiple; /* multiply by the magic number */ } m_hash = hash; } @@ -60,28 +59,27 @@ struct btHashString } }; -const int BT_HASH_NULL=0xffffffff; - +const int BT_HASH_NULL = 0xffffffff; class btHashInt { - int m_uid; -public: + int m_uid; +public: btHashInt() { } - btHashInt(int uid) :m_uid(uid) + btHashInt(int uid) : m_uid(uid) { } - int getUid1() const + int getUid1() const { return m_uid; } - void setUid1(int uid) + void setUid1(int uid) { m_uid = uid; } @@ -91,35 +89,35 @@ public: return getUid1() == other.getUid1(); } //to our success - SIMD_FORCE_INLINE unsigned int getHash()const + SIMD_FORCE_INLINE unsigned int getHash() const { unsigned int key = m_uid; // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); - + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; } }; - - class btHashPtr { - - union - { - const void* m_pointer; - unsigned int m_hashValues[2]; + union { + const void* m_pointer; + unsigned int m_hashValues[2]; }; public: - btHashPtr(const void* ptr) - :m_pointer(ptr) + : m_pointer(ptr) { } - const void* getPointer() const + const void* getPointer() const { return m_pointer; } @@ -130,64 +128,68 @@ public: } //to our success - SIMD_FORCE_INLINE unsigned int getHash()const + SIMD_FORCE_INLINE unsigned int getHash() const { - const bool VOID_IS_8 = ((sizeof(void*)==8)); - - unsigned int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0]; + const bool VOID_IS_8 = ((sizeof(void*) == 8)); + + unsigned int key = VOID_IS_8 ? 
m_hashValues[0] + m_hashValues[1] : m_hashValues[0]; // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); return key; } - - }; - template class btHashKeyPtr { - int m_uid; + int m_uid; + public: + btHashKeyPtr(int uid) : m_uid(uid) + { + } - btHashKeyPtr(int uid) :m_uid(uid) - { - } - - int getUid1() const - { - return m_uid; - } - - bool equals(const btHashKeyPtr& other) const - { - return getUid1() == other.getUid1(); - } - - //to our success - SIMD_FORCE_INLINE unsigned int getHash()const - { - unsigned int key = m_uid; - // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); - return key; - } - - -}; + int getUid1() const + { + return m_uid; + } + + bool equals(const btHashKeyPtr& other) const + { + return getUid1() == other.getUid1(); + } + //to our success + SIMD_FORCE_INLINE unsigned int getHash() const + { + unsigned int key = m_uid; + // Thomas Wang's hash + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; + } +}; template class btHashKey { - int m_uid; -public: + int m_uid; - btHashKey(int uid) :m_uid(uid) +public: + btHashKey(int uid) : m_uid(uid) { } - int getUid1() const + int getUid1() const { return m_uid; } @@ -197,30 +199,33 @@ public: return getUid1() == other.getUid1(); } //to our success - SIMD_FORCE_INLINE unsigned int getHash()const + SIMD_FORCE_INLINE unsigned int getHash() const { unsigned int key = m_uid; // Thomas Wang's hash - key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); return key; } }; - ///The btHashMap template class implements a generic and lightweight hashmap. 
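///For instance, a hedged sketch (the key and value are arbitrary, and insert() is
///assumed from btHashMap's public API alongside the find() shown below):
///
///  btHashMap<btHashInt, btScalar> map;
///  map.insert(btHashInt(42), btScalar(3.14));
///  const btScalar* value = map.find(btHashInt(42));  // returns NULL when the key is absent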
///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp template class btHashMap { - protected: - btAlignedObjectArray m_hashTable; - btAlignedObjectArray m_next; - - btAlignedObjectArray m_valueArray; - btAlignedObjectArray m_keyArray; + btAlignedObjectArray m_hashTable; + btAlignedObjectArray m_next; + + btAlignedObjectArray m_valueArray; + btAlignedObjectArray m_keyArray; - void growTables(const Key& /*key*/) + void growTables(const Key& /*key*/) { int newCapacity = m_valueArray.capacity(); @@ -234,7 +239,7 @@ protected: int i; - for (i= 0; i < newCapacity; ++i) + for (i = 0; i < newCapacity; ++i) { m_hashTable[i] = BT_HASH_NULL; } @@ -243,30 +248,28 @@ protected: m_next[i] = BT_HASH_NULL; } - for(i=0;i=0); - if (index>=0 && index < m_valueArray.size()) + btAssert(index >= 0); + if (index >= 0 && index < m_valueArray.size()) { return &m_valueArray[index]; } @@ -388,38 +389,39 @@ protected: Value* getAtIndex(int index) { btAssert(index < m_valueArray.size()); - btAssert(index>=0); - if (index>=0 && index < m_valueArray.size()) + btAssert(index >= 0); + if (index >= 0 && index < m_valueArray.size()) { return &m_valueArray[index]; } return 0; } - Key getKeyAtIndex(int index) - { - btAssert(index < m_keyArray.size()); - btAssert(index>=0); - return m_keyArray[index]; - } - - const Key getKeyAtIndex(int index) const - { - btAssert(index < m_keyArray.size()); - btAssert(index>=0); + Key getKeyAtIndex(int index) + { + btAssert(index < m_keyArray.size()); + btAssert(index >= 0); return m_keyArray[index]; - } + } + const Key getKeyAtIndex(int index) const + { + btAssert(index < m_keyArray.size()); + btAssert(index >= 0); + return m_keyArray[index]; + } - Value* operator[](const Key& key) { + Value* operator[](const Key& key) + { return find(key); } - const Value* operator[](const Key& key) const { + const Value* operator[](const Key& key) const + { return find(key); } - const Value* find(const Key& key) const + const Value* find(const Key& key) const { int index = findIndex(key); if (index == BT_HASH_NULL) @@ -429,7 +431,7 @@ protected: return &m_valueArray[index]; } - Value* find(const Key& key) + Value* find(const Key& key) { int index = findIndex(key); if (index == BT_HASH_NULL) @@ -439,10 +441,9 @@ protected: return &m_valueArray[index]; } - - int findIndex(const Key& key) const + int findIndex(const Key& key) const { - unsigned int hash = key.getHash() & (m_valueArray.capacity()-1); + unsigned int hash = key.getHash() & (m_valueArray.capacity() - 1); if (hash >= (unsigned int)m_hashTable.size()) { @@ -457,14 +458,13 @@ protected: return index; } - void clear() + void clear() { m_hashTable.clear(); m_next.clear(); m_valueArray.clear(); m_keyArray.clear(); } - }; -#endif //BT_HASH_MAP_H +#endif //BT_HASH_MAP_H diff --git a/thirdparty/bullet/LinearMath/btIDebugDraw.h b/thirdparty/bullet/LinearMath/btIDebugDraw.h index b57282717d..82ec19a69b 100644 --- a/thirdparty/bullet/LinearMath/btIDebugDraw.h +++ b/thirdparty/bullet/LinearMath/btIDebugDraw.h @@ -13,86 +13,84 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef BT_IDEBUG_DRAW__H #define BT_IDEBUG_DRAW__H #include "btVector3.h" #include "btTransform.h" - - ///The btIDebugDraw interface class allows hooking up a debug renderer to visually debug simulations. ///Typical use case: create a debug drawer object, and assign it to a btCollisionWorld or btDynamicsWorld using setDebugDrawer and call debugDrawWorld. 
///A class that implements the btIDebugDraw interface has to implement the drawLine method at a minimum. ///For color arguments the X,Y,Z components refer to Red, Green and Blue each in the range [0..1] -class btIDebugDraw +class btIDebugDraw { - public: - - ATTRIBUTE_ALIGNED16(struct) DefaultColors +public: + ATTRIBUTE_ALIGNED16(struct) + DefaultColors { - btVector3 m_activeObject; - btVector3 m_deactivatedObject; - btVector3 m_wantsDeactivationObject; - btVector3 m_disabledDeactivationObject; - btVector3 m_disabledSimulationObject; - btVector3 m_aabb; + btVector3 m_activeObject; + btVector3 m_deactivatedObject; + btVector3 m_wantsDeactivationObject; + btVector3 m_disabledDeactivationObject; + btVector3 m_disabledSimulationObject; + btVector3 m_aabb; btVector3 m_contactPoint; - + DefaultColors() - : m_activeObject(1,1,1), - m_deactivatedObject(0,1,0), - m_wantsDeactivationObject(0,1,1), - m_disabledDeactivationObject(1,0,0), - m_disabledSimulationObject(1,1,0), - m_aabb(1,0,0), - m_contactPoint(1,1,0) + : m_activeObject(1, 1, 1), + m_deactivatedObject(0, 1, 0), + m_wantsDeactivationObject(0, 1, 1), + m_disabledDeactivationObject(1, 0, 0), + m_disabledSimulationObject(1, 1, 0), + m_aabb(1, 0, 0), + m_contactPoint(1, 1, 0) { } }; - - enum DebugDrawModes + enum DebugDrawModes { - DBG_NoDebug=0, + DBG_NoDebug = 0, DBG_DrawWireframe = 1, - DBG_DrawAabb=2, - DBG_DrawFeaturesText=4, - DBG_DrawContactPoints=8, - DBG_NoDeactivation=16, + DBG_DrawAabb = 2, + DBG_DrawFeaturesText = 4, + DBG_DrawContactPoints = 8, + DBG_NoDeactivation = 16, DBG_NoHelpText = 32, - DBG_DrawText=64, + DBG_DrawText = 64, DBG_ProfileTimings = 128, DBG_EnableSatComparison = 256, DBG_DisableBulletLCP = 512, DBG_EnableCCD = 1024, DBG_DrawConstraints = (1 << 11), DBG_DrawConstraintLimits = (1 << 12), - DBG_FastWireframe = (1<<13), - DBG_DrawNormals = (1<<14), - DBG_DrawFrames = (1<<15), + DBG_FastWireframe = (1 << 13), + DBG_DrawNormals = (1 << 14), + DBG_DrawFrames = (1 << 15), DBG_MAX_DEBUG_DRAW_MODE }; - virtual ~btIDebugDraw() {}; + virtual ~btIDebugDraw(){}; - - virtual DefaultColors getDefaultColors() const { DefaultColors colors; return colors; } + virtual DefaultColors getDefaultColors() const + { + DefaultColors colors; + return colors; + } ///the default implementation for setDefaultColors has no effect. A derived class can implement it and store the colors. 
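
// Aside, not part of the patch: the DebugDrawModes values above are bit
// flags; they are combined with bitwise OR and queried with a mask, e.g.:
void enableCommonDebugModes(btIDebugDraw* drawer)  // hypothetical helper
{
	drawer->setDebugMode(btIDebugDraw::DBG_DrawWireframe |
						 btIDebugDraw::DBG_DrawAabb |
						 btIDebugDraw::DBG_DrawContactPoints);

	if (drawer->getDebugMode() & btIDebugDraw::DBG_DrawAabb)
	{
		// AABB drawing is now active.
	}
}
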
virtual void setDefaultColors(const DefaultColors& /*colors*/) {} - - virtual void drawLine(const btVector3& from,const btVector3& to,const btVector3& color)=0; - - virtual void drawLine(const btVector3& from,const btVector3& to, const btVector3& fromColor, const btVector3& toColor) + + virtual void drawLine(const btVector3& from, const btVector3& to, const btVector3& color) = 0; + + virtual void drawLine(const btVector3& from, const btVector3& to, const btVector3& fromColor, const btVector3& toColor) { - (void) toColor; - drawLine (from, to, fromColor); + (void)toColor; + drawLine(from, to, fromColor); } - virtual void drawSphere(btScalar radius, const btTransform& transform, const btVector3& color) + virtual void drawSphere(btScalar radius, const btTransform& transform, const btVector3& color) { - btVector3 center = transform.getOrigin(); btVector3 up = transform.getBasis().getColumn(1); btVector3 axis = transform.getBasis().getColumn(0); @@ -101,103 +99,102 @@ class btIDebugDraw btScalar minPs = -SIMD_HALF_PI; btScalar maxPs = SIMD_HALF_PI; btScalar stepDegrees = 30.f; - drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees ,false); - drawSpherePatch(center, up, -axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees,false ); + drawSpherePatch(center, up, axis, radius, minTh, maxTh, minPs, maxPs, color, stepDegrees, false); + drawSpherePatch(center, up, -axis, radius, minTh, maxTh, minPs, maxPs, color, stepDegrees, false); } - - virtual void drawSphere (const btVector3& p, btScalar radius, const btVector3& color) + + virtual void drawSphere(const btVector3& p, btScalar radius, const btVector3& color) { btTransform tr; tr.setIdentity(); tr.setOrigin(p); - drawSphere(radius,tr,color); + drawSphere(radius, tr, color); } - - virtual void drawTriangle(const btVector3& v0,const btVector3& v1,const btVector3& v2,const btVector3& /*n0*/,const btVector3& /*n1*/,const btVector3& /*n2*/,const btVector3& color, btScalar alpha) + + virtual void drawTriangle(const btVector3& v0, const btVector3& v1, const btVector3& v2, const btVector3& /*n0*/, const btVector3& /*n1*/, const btVector3& /*n2*/, const btVector3& color, btScalar alpha) { - drawTriangle(v0,v1,v2,color,alpha); + drawTriangle(v0, v1, v2, color, alpha); } - virtual void drawTriangle(const btVector3& v0,const btVector3& v1,const btVector3& v2,const btVector3& color, btScalar /*alpha*/) + virtual void drawTriangle(const btVector3& v0, const btVector3& v1, const btVector3& v2, const btVector3& color, btScalar /*alpha*/) { - drawLine(v0,v1,color); - drawLine(v1,v2,color); - drawLine(v2,v0,color); + drawLine(v0, v1, color); + drawLine(v1, v2, color); + drawLine(v2, v0, color); } - virtual void drawContactPoint(const btVector3& PointOnB,const btVector3& normalOnB,btScalar distance,int lifeTime,const btVector3& color)=0; + virtual void drawContactPoint(const btVector3& PointOnB, const btVector3& normalOnB, btScalar distance, int lifeTime, const btVector3& color) = 0; - virtual void reportErrorWarning(const char* warningString) = 0; + virtual void reportErrorWarning(const char* warningString) = 0; - virtual void draw3dText(const btVector3& location,const char* textString) = 0; - - virtual void setDebugMode(int debugMode) =0; - - virtual int getDebugMode() const = 0; + virtual void draw3dText(const btVector3& location, const char* textString) = 0; - virtual void drawAabb(const btVector3& from,const btVector3& to,const btVector3& color) - { + virtual void setDebugMode(int debugMode) = 0; + + virtual int 
getDebugMode() const = 0; - btVector3 halfExtents = (to-from)* 0.5f; - btVector3 center = (to+from) *0.5f; - int i,j; + virtual void drawAabb(const btVector3& from, const btVector3& to, const btVector3& color) + { + btVector3 halfExtents = (to - from) * 0.5f; + btVector3 center = (to + from) * 0.5f; + int i, j; - btVector3 edgecoord(1.f,1.f,1.f),pa,pb; - for (i=0;i<4;i++) + btVector3 edgecoord(1.f, 1.f, 1.f), pa, pb; + for (i = 0; i < 4; i++) { - for (j=0;j<3;j++) + for (j = 0; j < 3; j++) { - pa = btVector3(edgecoord[0]*halfExtents[0], edgecoord[1]*halfExtents[1], - edgecoord[2]*halfExtents[2]); - pa+=center; + pa = btVector3(edgecoord[0] * halfExtents[0], edgecoord[1] * halfExtents[1], + edgecoord[2] * halfExtents[2]); + pa += center; - int othercoord = j%3; - edgecoord[othercoord]*=-1.f; - pb = btVector3(edgecoord[0]*halfExtents[0], edgecoord[1]*halfExtents[1], - edgecoord[2]*halfExtents[2]); - pb+=center; + int othercoord = j % 3; + edgecoord[othercoord] *= -1.f; + pb = btVector3(edgecoord[0] * halfExtents[0], edgecoord[1] * halfExtents[1], + edgecoord[2] * halfExtents[2]); + pb += center; - drawLine(pa,pb,color); + drawLine(pa, pb, color); } - edgecoord = btVector3(-1.f,-1.f,-1.f); - if (i<3) - edgecoord[i]*=-1.f; + edgecoord = btVector3(-1.f, -1.f, -1.f); + if (i < 3) + edgecoord[i] *= -1.f; } } virtual void drawTransform(const btTransform& transform, btScalar orthoLen) { btVector3 start = transform.getOrigin(); - drawLine(start, start+transform.getBasis() * btVector3(orthoLen, 0, 0), btVector3(btScalar(1.), btScalar(0.3), btScalar(0.3))); - drawLine(start, start+transform.getBasis() * btVector3(0, orthoLen, 0), btVector3(btScalar(0.3), btScalar(1.), btScalar(0.3))); - drawLine(start, start+transform.getBasis() * btVector3(0, 0, orthoLen), btVector3(btScalar(0.3), btScalar(0.3), btScalar(1.))); + drawLine(start, start + transform.getBasis() * btVector3(orthoLen, 0, 0), btVector3(btScalar(1.), btScalar(0.3), btScalar(0.3))); + drawLine(start, start + transform.getBasis() * btVector3(0, orthoLen, 0), btVector3(btScalar(0.3), btScalar(1.), btScalar(0.3))); + drawLine(start, start + transform.getBasis() * btVector3(0, 0, orthoLen), btVector3(btScalar(0.3), btScalar(0.3), btScalar(1.))); } - virtual void drawArc(const btVector3& center, const btVector3& normal, const btVector3& axis, btScalar radiusA, btScalar radiusB, btScalar minAngle, btScalar maxAngle, - const btVector3& color, bool drawSect, btScalar stepDegrees = btScalar(10.f)) + virtual void drawArc(const btVector3& center, const btVector3& normal, const btVector3& axis, btScalar radiusA, btScalar radiusB, btScalar minAngle, btScalar maxAngle, + const btVector3& color, bool drawSect, btScalar stepDegrees = btScalar(10.f)) { const btVector3& vx = axis; btVector3 vy = normal.cross(axis); btScalar step = stepDegrees * SIMD_RADS_PER_DEG; int nSteps = (int)btFabs((maxAngle - minAngle) / step); - if(!nSteps) nSteps = 1; + if (!nSteps) nSteps = 1; btVector3 prev = center + radiusA * vx * btCos(minAngle) + radiusB * vy * btSin(minAngle); - if(drawSect) + if (drawSect) { drawLine(center, prev, color); } - for(int i = 1; i <= nSteps; i++) + for (int i = 1; i <= nSteps; i++) { btScalar angle = minAngle + (maxAngle - minAngle) * btScalar(i) / btScalar(nSteps); btVector3 next = center + radiusA * vx * btCos(angle) + radiusB * vy * btSin(angle); drawLine(prev, next, color); prev = next; } - if(drawSect) + if (drawSect) { drawLine(center, prev, color); } } - virtual void drawSpherePatch(const btVector3& center, const btVector3& up, const 
btVector3& axis, btScalar radius, - btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f),bool drawCenter = true) + virtual void drawSpherePatch(const btVector3& center, const btVector3& up, const btVector3& axis, btScalar radius, + btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f), bool drawCenter = true) { btVector3 vA[74]; btVector3 vB[74]; @@ -211,33 +208,33 @@ class btIDebugDraw btVector3 jv = kv.cross(iv); bool drawN = false; bool drawS = false; - if(minTh <= -SIMD_HALF_PI) + if (minTh <= -SIMD_HALF_PI) { minTh = -SIMD_HALF_PI + step; drawN = true; } - if(maxTh >= SIMD_HALF_PI) + if (maxTh >= SIMD_HALF_PI) { maxTh = SIMD_HALF_PI - step; drawS = true; } - if(minTh > maxTh) + if (minTh > maxTh) { minTh = -SIMD_HALF_PI + step; - maxTh = SIMD_HALF_PI - step; + maxTh = SIMD_HALF_PI - step; drawN = drawS = true; } int n_hor = (int)((maxTh - minTh) / step) + 1; - if(n_hor < 2) n_hor = 2; + if (n_hor < 2) n_hor = 2; btScalar step_h = (maxTh - minTh) / btScalar(n_hor - 1); bool isClosed = false; - if(minPs > maxPs) + if (minPs > maxPs) { minPs = -SIMD_PI + step; - maxPs = SIMD_PI; + maxPs = SIMD_PI; isClosed = true; } - else if((maxPs - minPs) >= SIMD_PI * btScalar(2.f)) + else if ((maxPs - minPs) >= SIMD_PI * btScalar(2.f)) { isClosed = true; } @@ -246,63 +243,64 @@ class btIDebugDraw isClosed = false; } int n_vert = (int)((maxPs - minPs) / step) + 1; - if(n_vert < 2) n_vert = 2; + if (n_vert < 2) n_vert = 2; btScalar step_v = (maxPs - minPs) / btScalar(n_vert - 1); - for(int i = 0; i < n_hor; i++) + for (int i = 0; i < n_hor; i++) { btScalar th = minTh + btScalar(i) * step_h; btScalar sth = radius * btSin(th); btScalar cth = radius * btCos(th); - for(int j = 0; j < n_vert; j++) + for (int j = 0; j < n_vert; j++) { btScalar psi = minPs + btScalar(j) * step_v; btScalar sps = btSin(psi); btScalar cps = btCos(psi); pvB[j] = center + cth * cps * iv + cth * sps * jv + sth * kv; - if(i) + if (i) { drawLine(pvA[j], pvB[j], color); } - else if(drawS) + else if (drawS) { drawLine(spole, pvB[j], color); } - if(j) + if (j) { - drawLine(pvB[j-1], pvB[j], color); + drawLine(pvB[j - 1], pvB[j], color); } else { arcStart = pvB[j]; } - if((i == (n_hor - 1)) && drawN) + if ((i == (n_hor - 1)) && drawN) { drawLine(npole, pvB[j], color); } - + if (drawCenter) { - if(isClosed) + if (isClosed) { - if(j == (n_vert-1)) + if (j == (n_vert - 1)) { drawLine(arcStart, pvB[j], color); } } else { - if(((!i) || (i == (n_hor-1))) && ((!j) || (j == (n_vert-1)))) + if (((!i) || (i == (n_hor - 1))) && ((!j) || (j == (n_vert - 1)))) { drawLine(center, pvB[j], color); } } } } - pT = pvA; pvA = pvB; pvB = pT; + pT = pvA; + pvA = pvB; + pvB = pT; } } - - + virtual void drawBox(const btVector3& bbMin, const btVector3& bbMax, const btVector3& color) { drawLine(btVector3(bbMin[0], bbMin[1], bbMin[2]), btVector3(bbMax[0], bbMin[1], bbMin[2]), color); @@ -338,31 +336,27 @@ class btIDebugDraw { int stepDegrees = 30; - btVector3 capStart(0.f,0.f,0.f); + btVector3 capStart(0.f, 0.f, 0.f); capStart[upAxis] = -halfHeight; - btVector3 capEnd(0.f,0.f,0.f); + btVector3 capEnd(0.f, 0.f, 0.f); capEnd[upAxis] = halfHeight; // Draw the ends { - btTransform childTransform = transform; childTransform.getOrigin() = transform * capStart; { btVector3 center = childTransform.getOrigin(); - btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3); + btVector3 up = 
childTransform.getBasis().getColumn((upAxis + 1) % 3); btVector3 axis = -childTransform.getBasis().getColumn(upAxis); btScalar minTh = -SIMD_HALF_PI; btScalar maxTh = SIMD_HALF_PI; btScalar minPs = -SIMD_HALF_PI; btScalar maxPs = SIMD_HALF_PI; - - drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false); - } - - + drawSpherePatch(center, up, axis, radius, minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees), false); + } } { @@ -370,52 +364,51 @@ class btIDebugDraw childTransform.getOrigin() = transform * capEnd; { btVector3 center = childTransform.getOrigin(); - btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3); + btVector3 up = childTransform.getBasis().getColumn((upAxis + 1) % 3); btVector3 axis = childTransform.getBasis().getColumn(upAxis); btScalar minTh = -SIMD_HALF_PI; btScalar maxTh = SIMD_HALF_PI; btScalar minPs = -SIMD_HALF_PI; btScalar maxPs = SIMD_HALF_PI; - drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false); + drawSpherePatch(center, up, axis, radius, minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees), false); } } // Draw some additional lines btVector3 start = transform.getOrigin(); - for (int i=0;i<360;i+=stepDegrees) + for (int i = 0; i < 360; i += stepDegrees) { - capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color); + capEnd[(upAxis + 1) % 3] = capStart[(upAxis + 1) % 3] = btSin(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + capEnd[(upAxis + 2) % 3] = capStart[(upAxis + 2) % 3] = btCos(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + drawLine(start + transform.getBasis() * capStart, start + transform.getBasis() * capEnd, color); } - } virtual void drawCylinder(btScalar radius, btScalar halfHeight, int upAxis, const btTransform& transform, const btVector3& color) { btVector3 start = transform.getOrigin(); - btVector3 offsetHeight(0,0,0); + btVector3 offsetHeight(0, 0, 0); offsetHeight[upAxis] = halfHeight; - int stepDegrees=30; - btVector3 capStart(0.f,0.f,0.f); + int stepDegrees = 30; + btVector3 capStart(0.f, 0.f, 0.f); capStart[upAxis] = -halfHeight; - btVector3 capEnd(0.f,0.f,0.f); + btVector3 capEnd(0.f, 0.f, 0.f); capEnd[upAxis] = halfHeight; - for (int i=0;i<360;i+=stepDegrees) + for (int i = 0; i < 360; i += stepDegrees) { - capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color); + capEnd[(upAxis + 1) % 3] = capStart[(upAxis + 1) % 3] = btSin(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + capEnd[(upAxis + 2) % 3] = capStart[(upAxis + 2) % 3] = btCos(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + drawLine(start + transform.getBasis() * capStart, start + transform.getBasis() * capEnd, color); } // Drawing top and bottom caps of the cylinder - btVector3 yaxis(0,0,0); + btVector3 yaxis(0, 0, 0); yaxis[upAxis] = btScalar(1.0); - btVector3 xaxis(0,0,0); - xaxis[(upAxis+1)%3] = btScalar(1.0); - drawArc(start-transform.getBasis()*(offsetHeight),transform.getBasis()*yaxis,transform.getBasis()*xaxis,radius,radius,0,SIMD_2_PI,color,false,btScalar(10.0)); - 
drawArc(start+transform.getBasis()*(offsetHeight),transform.getBasis()*yaxis,transform.getBasis()*xaxis,radius,radius,0,SIMD_2_PI,color,false,btScalar(10.0)); + btVector3 xaxis(0, 0, 0); + xaxis[(upAxis + 1) % 3] = btScalar(1.0); + drawArc(start - transform.getBasis() * (offsetHeight), transform.getBasis() * yaxis, transform.getBasis() * xaxis, radius, radius, 0, SIMD_2_PI, color, false, btScalar(10.0)); + drawArc(start + transform.getBasis() * (offsetHeight), transform.getBasis() * yaxis, transform.getBasis() * xaxis, radius, radius, 0, SIMD_2_PI, color, false, btScalar(10.0)); } virtual void drawCone(btScalar radius, btScalar height, int upAxis, const btTransform& transform, const btVector3& color) @@ -423,50 +416,49 @@ class btIDebugDraw int stepDegrees = 30; btVector3 start = transform.getOrigin(); - btVector3 offsetHeight(0,0,0); + btVector3 offsetHeight(0, 0, 0); btScalar halfHeight = height * btScalar(0.5); offsetHeight[upAxis] = halfHeight; - btVector3 offsetRadius(0,0,0); - offsetRadius[(upAxis+1)%3] = radius; - btVector3 offset2Radius(0,0,0); - offset2Radius[(upAxis+2)%3] = radius; + btVector3 offsetRadius(0, 0, 0); + offsetRadius[(upAxis + 1) % 3] = radius; + btVector3 offset2Radius(0, 0, 0); + offset2Radius[(upAxis + 2) % 3] = radius; - - btVector3 capEnd(0.f,0.f,0.f); + btVector3 capEnd(0.f, 0.f, 0.f); capEnd[upAxis] = -halfHeight; - for (int i=0;i<360;i+=stepDegrees) + for (int i = 0; i < 360; i += stepDegrees) { - capEnd[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - capEnd[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius; - drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * capEnd, color); + capEnd[(upAxis + 1) % 3] = btSin(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + capEnd[(upAxis + 2) % 3] = btCos(btScalar(i) * SIMD_RADS_PER_DEG) * radius; + drawLine(start + transform.getBasis() * (offsetHeight), start + transform.getBasis() * capEnd, color); } - drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offsetRadius),color); - drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight-offsetRadius),color); - drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offset2Radius),color); - drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight-offset2Radius),color); + drawLine(start + transform.getBasis() * (offsetHeight), start + transform.getBasis() * (-offsetHeight + offsetRadius), color); + drawLine(start + transform.getBasis() * (offsetHeight), start + transform.getBasis() * (-offsetHeight - offsetRadius), color); + drawLine(start + transform.getBasis() * (offsetHeight), start + transform.getBasis() * (-offsetHeight + offset2Radius), color); + drawLine(start + transform.getBasis() * (offsetHeight), start + transform.getBasis() * (-offsetHeight - offset2Radius), color); // Drawing the base of the cone - btVector3 yaxis(0,0,0); + btVector3 yaxis(0, 0, 0); yaxis[upAxis] = btScalar(1.0); - btVector3 xaxis(0,0,0); - xaxis[(upAxis+1)%3] = btScalar(1.0); - drawArc(start-transform.getBasis()*(offsetHeight),transform.getBasis()*yaxis,transform.getBasis()*xaxis,radius,radius,0,SIMD_2_PI,color,false,10.0); + btVector3 xaxis(0, 0, 0); + xaxis[(upAxis + 1) % 3] = btScalar(1.0); + drawArc(start - transform.getBasis() * (offsetHeight), transform.getBasis() * yaxis, transform.getBasis() * xaxis, radius, radius, 0, SIMD_2_PI, color, false, 10.0); } virtual void 
drawPlane(const btVector3& planeNormal, btScalar planeConst, const btTransform& transform, const btVector3& color) { btVector3 planeOrigin = planeNormal * planeConst; - btVector3 vec0,vec1; - btPlaneSpace1(planeNormal,vec0,vec1); + btVector3 vec0, vec1; + btPlaneSpace1(planeNormal, vec0, vec1); btScalar vecLen = 100.f; - btVector3 pt0 = planeOrigin + vec0*vecLen; - btVector3 pt1 = planeOrigin - vec0*vecLen; - btVector3 pt2 = planeOrigin + vec1*vecLen; - btVector3 pt3 = planeOrigin - vec1*vecLen; - drawLine(transform*pt0,transform*pt1,color); - drawLine(transform*pt2,transform*pt3,color); + btVector3 pt0 = planeOrigin + vec0 * vecLen; + btVector3 pt1 = planeOrigin - vec0 * vecLen; + btVector3 pt2 = planeOrigin + vec1 * vecLen; + btVector3 pt3 = planeOrigin - vec1 * vecLen; + drawLine(transform * pt0, transform * pt1, color); + drawLine(transform * pt2, transform * pt3, color); } virtual void clearLines() @@ -478,6 +470,4 @@ class btIDebugDraw } }; - -#endif //BT_IDEBUG_DRAW__H - +#endif //BT_IDEBUG_DRAW__H diff --git a/thirdparty/bullet/LinearMath/btList.h b/thirdparty/bullet/LinearMath/btList.h index eec80a7064..b255938c30 100644 --- a/thirdparty/bullet/LinearMath/btList.h +++ b/thirdparty/bullet/LinearMath/btList.h @@ -12,62 +12,62 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_GEN_LIST_H #define BT_GEN_LIST_H -class btGEN_Link { +class btGEN_Link +{ public: - btGEN_Link() : m_next(0), m_prev(0) {} - btGEN_Link(btGEN_Link *next, btGEN_Link *prev) : m_next(next), m_prev(prev) {} - - btGEN_Link *getNext() const { return m_next; } - btGEN_Link *getPrev() const { return m_prev; } - - bool isHead() const { return m_prev == 0; } - bool isTail() const { return m_next == 0; } - - void insertBefore(btGEN_Link *link) { - m_next = link; - m_prev = link->m_prev; - m_next->m_prev = this; - m_prev->m_next = this; - } - - void insertAfter(btGEN_Link *link) { - m_next = link->m_next; - m_prev = link; - m_next->m_prev = this; - m_prev->m_next = this; - } - - void remove() { - m_next->m_prev = m_prev; - m_prev->m_next = m_next; - } + btGEN_Link() : m_next(0), m_prev(0) {} + btGEN_Link(btGEN_Link *next, btGEN_Link *prev) : m_next(next), m_prev(prev) {} + + btGEN_Link *getNext() const { return m_next; } + btGEN_Link *getPrev() const { return m_prev; } + + bool isHead() const { return m_prev == 0; } + bool isTail() const { return m_next == 0; } + + void insertBefore(btGEN_Link *link) + { + m_next = link; + m_prev = link->m_prev; + m_next->m_prev = this; + m_prev->m_next = this; + } + + void insertAfter(btGEN_Link *link) + { + m_next = link->m_next; + m_prev = link; + m_next->m_prev = this; + m_prev->m_next = this; + } + + void remove() + { + m_next->m_prev = m_prev; + m_prev->m_next = m_next; + } -private: - btGEN_Link *m_next; - btGEN_Link *m_prev; +private: + btGEN_Link *m_next; + btGEN_Link *m_prev; }; -class btGEN_List { +class btGEN_List +{ public: - btGEN_List() : m_head(&m_tail, 0), m_tail(0, &m_head) {} + btGEN_List() : m_head(&m_tail, 0), m_tail(0, &m_head) {} + + btGEN_Link *getHead() const { return m_head.getNext(); } + btGEN_Link *getTail() const { return m_tail.getPrev(); } - btGEN_Link *getHead() const { return m_head.getNext(); } - btGEN_Link *getTail() const { return m_tail.getPrev(); } + void addHead(btGEN_Link *link) { link->insertAfter(&m_head); } + void addTail(btGEN_Link *link) { link->insertBefore(&m_tail); } - void addHead(btGEN_Link *link) { link->insertAfter(&m_head); } - void addTail(btGEN_Link 
*link) { link->insertBefore(&m_tail); } - private: - btGEN_Link m_head; - btGEN_Link m_tail; + btGEN_Link m_head; + btGEN_Link m_tail; }; -#endif //BT_GEN_LIST_H - - - +#endif //BT_GEN_LIST_H diff --git a/thirdparty/bullet/LinearMath/btMatrix3x3.h b/thirdparty/bullet/LinearMath/btMatrix3x3.h index 6cc4993da5..0a08ae409a 100644 --- a/thirdparty/bullet/LinearMath/btMatrix3x3.h +++ b/thirdparty/bullet/LinearMath/btMatrix3x3.h @@ -12,8 +12,7 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - -#ifndef BT_MATRIX3x3_H +#ifndef BT_MATRIX3x3_H #define BT_MATRIX3x3_H #include "btVector3.h" @@ -23,13 +22,13 @@ subject to the following restrictions: #ifdef BT_USE_SSE //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f}; //const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f}; -#define vMPPP (_mm_set_ps (+0.0f, +0.0f, +0.0f, -0.0f)) +#define vMPPP (_mm_set_ps(+0.0f, +0.0f, +0.0f, -0.0f)) #endif #if defined(BT_USE_SSE) -#define v1000 (_mm_set_ps(0.0f,0.0f,0.0f,1.0f)) -#define v0100 (_mm_set_ps(0.0f,0.0f,1.0f,0.0f)) -#define v0010 (_mm_set_ps(0.0f,1.0f,0.0f,0.0f)) +#define v1000 (_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f)) +#define v0100 (_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f)) +#define v0010 (_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f)) #elif defined(BT_USE_NEON) const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f}; const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f}; @@ -37,22 +36,22 @@ const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f}; #endif #ifdef BT_USE_DOUBLE_PRECISION -#define btMatrix3x3Data btMatrix3x3DoubleData +#define btMatrix3x3Data btMatrix3x3DoubleData #else -#define btMatrix3x3Data btMatrix3x3FloatData -#endif //BT_USE_DOUBLE_PRECISION - +#define btMatrix3x3Data btMatrix3x3FloatData +#endif //BT_USE_DOUBLE_PRECISION /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3. * Make sure to only include a pure orthogonal matrix without scaling. 
*/ -ATTRIBUTE_ALIGNED16(class) btMatrix3x3 { - +ATTRIBUTE_ALIGNED16(class) +btMatrix3x3 +{ ///Data storage for the matrix, each vector is a row of the matrix btVector3 m_el[3]; public: /** @brief No initializaion constructor */ - btMatrix3x3 () {} + btMatrix3x3() {} // explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); } @@ -67,27 +66,27 @@ public: */ /** @brief Constructor with row major formatting */ btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz, - const btScalar& yx, const btScalar& yy, const btScalar& yz, - const btScalar& zx, const btScalar& zy, const btScalar& zz) - { - setValue(xx, xy, xz, - yx, yy, yz, - zx, zy, zz); + const btScalar& yx, const btScalar& yy, const btScalar& yz, + const btScalar& zx, const btScalar& zy, const btScalar& zz) + { + setValue(xx, xy, xz, + yx, yy, yz, + zx, zy, zz); } -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) - SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 ) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + SIMD_FORCE_INLINE btMatrix3x3(const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2) { - m_el[0].mVec128 = v0; - m_el[1].mVec128 = v1; - m_el[2].mVec128 = v2; + m_el[0].mVec128 = v0; + m_el[1].mVec128 = v1; + m_el[2].mVec128 = v2; } - SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 ) + SIMD_FORCE_INLINE btMatrix3x3(const btVector3& v0, const btVector3& v1, const btVector3& v2) { - m_el[0] = v0; - m_el[1] = v1; - m_el[2] = v2; + m_el[0] = v0; + m_el[1] = v1; + m_el[2] = v2; } // Copy constructor @@ -99,25 +98,25 @@ public: } // Assignment Operator - SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m) + SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m) { m_el[0].mVec128 = m.m_el[0].mVec128; m_el[1].mVec128 = m.m_el[1].mVec128; m_el[2].mVec128 = m.m_el[2].mVec128; - + return *this; } #else /** @brief Copy constructor */ - SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other) + SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& other) { m_el[0] = other.m_el[0]; m_el[1] = other.m_el[1]; m_el[2] = other.m_el[2]; } - + /** @brief Assignment Operator */ SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other) { @@ -133,10 +132,9 @@ public: * @param i Column number 0 indexed */ SIMD_FORCE_INLINE btVector3 getColumn(int i) const { - return btVector3(m_el[0][i],m_el[1][i],m_el[2][i]); + return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]); } - /** @brief Get a row of the matrix as a vector * @param i Row number 0 indexed */ SIMD_FORCE_INLINE const btVector3& getRow(int i) const @@ -147,10 +145,10 @@ public: /** @brief Get a mutable reference to a row of the matrix as a vector * @param i Row number 0 indexed */ - SIMD_FORCE_INLINE btVector3& operator[](int i) - { + SIMD_FORCE_INLINE btVector3& operator[](int i) + { btFullAssert(0 <= i && i < 3); - return m_el[i]; + return m_el[i]; } /** @brief Get a const reference to a row of the matrix as a vector @@ -158,32 +156,31 @@ public: SIMD_FORCE_INLINE const btVector3& operator[](int i) const { btFullAssert(0 <= i && i < 3); - return m_el[i]; + return m_el[i]; } /** @brief Multiply by the target matrix on the right * @param m Rotation matrix to be applied * Equivilant to this = this * m */ - btMatrix3x3& operator*=(const btMatrix3x3& m); + btMatrix3x3& operator*=(const btMatrix3x3& m); /** @brief Adds by the target matrix on the right * @param m matrix to be 
applied * Equivilant to this = this + m */ - btMatrix3x3& operator+=(const btMatrix3x3& m); + btMatrix3x3& operator+=(const btMatrix3x3& m); /** @brief Substractss by the target matrix on the right * @param m matrix to be applied * Equivilant to this = this - m */ - btMatrix3x3& operator-=(const btMatrix3x3& m); + btMatrix3x3& operator-=(const btMatrix3x3& m); /** @brief Set from the rotational part of a 4x4 OpenGL matrix * @param m A pointer to the beginning of the array of scalars*/ - void setFromOpenGLSubMatrix(const btScalar *m) + void setFromOpenGLSubMatrix(const btScalar* m) { - m_el[0].setValue(m[0],m[4],m[8]); - m_el[1].setValue(m[1],m[5],m[9]); - m_el[2].setValue(m[2],m[6],m[10]); - + m_el[0].setValue(m[0], m[4], m[8]); + m_el[1].setValue(m[1], m[5], m[9]); + m_el[2].setValue(m[2], m[6], m[10]); } /** @brief Set the values of the matrix explicitly (row major) * @param xx Top left @@ -195,93 +192,92 @@ public: * @param zx Bottom Left * @param zy Bottom Middle * @param zz Bottom Right*/ - void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz, - const btScalar& yx, const btScalar& yy, const btScalar& yz, - const btScalar& zx, const btScalar& zy, const btScalar& zz) + void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz, + const btScalar& yx, const btScalar& yy, const btScalar& yz, + const btScalar& zx, const btScalar& zy, const btScalar& zz) { - m_el[0].setValue(xx,xy,xz); - m_el[1].setValue(yx,yy,yz); - m_el[2].setValue(zx,zy,zz); + m_el[0].setValue(xx, xy, xz); + m_el[1].setValue(yx, yy, yz); + m_el[2].setValue(zx, zy, zz); } /** @brief Set the matrix from a quaternion - * @param q The Quaternion to match */ - void setRotation(const btQuaternion& q) + * @param q The Quaternion to match */ + void setRotation(const btQuaternion& q) { btScalar d = q.length2(); btFullAssert(d != btScalar(0.0)); btScalar s = btScalar(2.0) / d; - - #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vs, Q = q.get128(); + +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vs, Q = q.get128(); __m128i Qi = btCastfTo128i(Q); - __m128 Y, Z; - __m128 V1, V2, V3; - __m128 V11, V21, V31; - __m128 NQ = _mm_xor_ps(Q, btvMzeroMask); + __m128 Y, Z; + __m128 V1, V2, V3; + __m128 V11, V21, V31; + __m128 NQ = _mm_xor_ps(Q, btvMzeroMask); __m128i NQi = btCastfTo128i(NQ); - - V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3))); // Y X Z W - V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3)); // -X -X Y W - V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3))); // Z Y X W - V1 = _mm_xor_ps(V1, vMPPP); // change the sign of the first element - - V11 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3))); // Y Y X W - V21 = _mm_unpackhi_ps(Q, Q); // Z Z W W - V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3)); // X Z -X -W - - V2 = V2 * V1; // - V1 = V1 * V11; // - V3 = V3 * V31; // - - V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3)); // -Z -W Y W - V11 = V11 * V21; // - V21 = _mm_xor_ps(V21, vMPPP); // change the sign of the first element - V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3)); // W W -Y -W - V31 = _mm_xor_ps(V31, vMPPP); // change the sign of the first element - Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3))); // -W -Z -X -W - Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3))); // Y X Y W + + V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3))); // Y X Z W + V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3)); // -X -X Y W 
+ V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3))); // Z Y X W + V1 = _mm_xor_ps(V1, vMPPP); // change the sign of the first element + + V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3))); // Y Y X W + V21 = _mm_unpackhi_ps(Q, Q); // Z Z W W + V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3)); // X Z -X -W + + V2 = V2 * V1; // + V1 = V1 * V11; // + V3 = V3 * V31; // + + V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3)); // -Z -W Y W + V11 = V11 * V21; // + V21 = _mm_xor_ps(V21, vMPPP); // change the sign of the first element + V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3)); // W W -Y -W + V31 = _mm_xor_ps(V31, vMPPP); // change the sign of the first element + Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3))); // -W -Z -X -W + Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3))); // Y X Y W vs = _mm_load_ss(&s); V21 = V21 * Y; V31 = V31 * Z; V1 = V1 + V11; - V2 = V2 + V21; - V3 = V3 + V31; - - vs = bt_splat3_ps(vs, 0); - // s ready - V1 = V1 * vs; - V2 = V2 * vs; - V3 = V3 * vs; - - V1 = V1 + v1000; - V2 = V2 + v0100; - V3 = V3 + v0010; - - m_el[0] = V1; - m_el[1] = V2; - m_el[2] = V3; - #else - btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s; - btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs; - btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs; - btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs; + V2 = V2 + V21; + V3 = V3 + V31; + + vs = bt_splat3_ps(vs, 0); + // s ready + V1 = V1 * vs; + V2 = V2 * vs; + V3 = V3 * vs; + + V1 = V1 + v1000; + V2 = V2 + v0100; + V3 = V3 + v0010; + + m_el[0] = V1; + m_el[1] = V2; + m_el[2] = V3; +#else + btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s; + btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs; + btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs; + btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs; setValue( - btScalar(1.0) - (yy + zz), xy - wz, xz + wy, + btScalar(1.0) - (yy + zz), xy - wz, xz + wy, xy + wz, btScalar(1.0) - (xx + zz), yz - wx, xz - wy, yz + wx, btScalar(1.0) - (xx + yy)); - #endif - } - +#endif + } /** @brief Set the matrix from euler angles using YPR around YXZ respectively * @param yaw Yaw about Y axis * @param pitch Pitch about X axis * @param roll Roll about Z axis */ - void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll) + void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll) { setEulerZYX(roll, pitch, yaw); } @@ -295,182 +291,197 @@ public: * angles are applied in ZYX order. 
I.e a vector is first rotated * about X then Y and then Z **/ - void setEulerZYX(btScalar eulerX,btScalar eulerY,btScalar eulerZ) { + void setEulerZYX(btScalar eulerX, btScalar eulerY, btScalar eulerZ) + { ///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code - btScalar ci ( btCos(eulerX)); - btScalar cj ( btCos(eulerY)); - btScalar ch ( btCos(eulerZ)); - btScalar si ( btSin(eulerX)); - btScalar sj ( btSin(eulerY)); - btScalar sh ( btSin(eulerZ)); - btScalar cc = ci * ch; - btScalar cs = ci * sh; - btScalar sc = si * ch; + btScalar ci(btCos(eulerX)); + btScalar cj(btCos(eulerY)); + btScalar ch(btCos(eulerZ)); + btScalar si(btSin(eulerX)); + btScalar sj(btSin(eulerY)); + btScalar sh(btSin(eulerZ)); + btScalar cc = ci * ch; + btScalar cs = ci * sh; + btScalar sc = si * ch; btScalar ss = si * sh; setValue(cj * ch, sj * sc - cs, sj * cc + ss, - cj * sh, sj * ss + cc, sj * cs - sc, - -sj, cj * si, cj * ci); + cj * sh, sj * ss + cc, sj * cs - sc, + -sj, cj * si, cj * ci); } /**@brief Set the matrix to the identity */ void setIdentity() - { -#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON) - m_el[0] = v1000; - m_el[1] = v0100; - m_el[2] = v0010; + { +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + m_el[0] = v1000; + m_el[1] = v0100; + m_el[2] = v0010; #else - setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0), - btScalar(0.0), btScalar(1.0), btScalar(0.0), - btScalar(0.0), btScalar(0.0), btScalar(1.0)); + setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0), + btScalar(0.0), btScalar(1.0), btScalar(0.0), + btScalar(0.0), btScalar(0.0), btScalar(1.0)); #endif } - static const btMatrix3x3& getIdentity() + static const btMatrix3x3& getIdentity() { -#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON) - static const btMatrix3x3 - identityMatrix(v1000, v0100, v0010); +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + static const btMatrix3x3 + identityMatrix(v1000, v0100, v0010); #else - static const btMatrix3x3 - identityMatrix( - btScalar(1.0), btScalar(0.0), btScalar(0.0), - btScalar(0.0), btScalar(1.0), btScalar(0.0), - btScalar(0.0), btScalar(0.0), btScalar(1.0)); + static const btMatrix3x3 + identityMatrix( + btScalar(1.0), btScalar(0.0), btScalar(0.0), + btScalar(0.0), btScalar(1.0), btScalar(0.0), + btScalar(0.0), btScalar(0.0), btScalar(1.0)); #endif return identityMatrix; } /**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective * @param m The array to be filled */ - void getOpenGLSubMatrix(btScalar *m) const + void getOpenGLSubMatrix(btScalar * m) const { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 v0 = m_el[0].mVec128; - __m128 v1 = m_el[1].mVec128; - __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 - __m128 *vm = (__m128 *)m; - __m128 vT; - - v2 = _mm_and_ps(v2, btvFFF0fMask); // x2 y2 z2 0 - - vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * - v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 - - v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) ); // y0 y1 y2 0 - v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) ); // x0 x1 x2 0 - v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT))); // z0 z1 z2 0 - - vm[0] = v0; - vm[1] = v1; - vm[2] = v2; +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 v0 = m_el[0].mVec128; + __m128 v1 = m_el[1].mVec128; + __m128 v2 = 
m_el[2].mVec128; // x2 y2 z2 w2 + __m128* vm = (__m128*)m; + __m128 vT; + + v2 = _mm_and_ps(v2, btvFFF0fMask); // x2 y2 z2 0 + + vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * + v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 + + v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0 + v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0 + v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT))); // z0 z1 z2 0 + + vm[0] = v0; + vm[1] = v1; + vm[2] = v2; #elif defined(BT_USE_NEON) - // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. - static const uint32x2_t zMask = (const uint32x2_t) {static_cast(-1), 0 }; - float32x4_t *vm = (float32x4_t *)m; - float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1} - float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0} - float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] ); - float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] ); - float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask ); - float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q ); // z0 z1 z2 0 - - vm[0] = v0; - vm[1] = v1; - vm[2] = v2; + // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. + static const uint32x2_t zMask = (const uint32x2_t){static_cast(-1), 0}; + float32x4_t* vm = (float32x4_t*)m; + float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1} + float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0} + float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]); + float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]); + float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask); + float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0 + + vm[0] = v0; + vm[1] = v1; + vm[2] = v2; #else - m[0] = btScalar(m_el[0].x()); - m[1] = btScalar(m_el[1].x()); - m[2] = btScalar(m_el[2].x()); - m[3] = btScalar(0.0); - m[4] = btScalar(m_el[0].y()); - m[5] = btScalar(m_el[1].y()); - m[6] = btScalar(m_el[2].y()); - m[7] = btScalar(0.0); - m[8] = btScalar(m_el[0].z()); - m[9] = btScalar(m_el[1].z()); + m[0] = btScalar(m_el[0].x()); + m[1] = btScalar(m_el[1].x()); + m[2] = btScalar(m_el[2].x()); + m[3] = btScalar(0.0); + m[4] = btScalar(m_el[0].y()); + m[5] = btScalar(m_el[1].y()); + m[6] = btScalar(m_el[2].y()); + m[7] = btScalar(0.0); + m[8] = btScalar(m_el[0].z()); + m[9] = btScalar(m_el[1].z()); m[10] = btScalar(m_el[2].z()); - m[11] = btScalar(0.0); + m[11] = btScalar(0.0); #endif } /**@brief Get the matrix represented as a quaternion * @param q The quaternion which will be set */ - void getRotation(btQuaternion& q) const + void getRotation(btQuaternion & q) const { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) - btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z(); - btScalar s, x; - - union { - btSimdFloat4 vec; - btScalar f[4]; - } temp; - - if (trace > btScalar(0.0)) - { - x = trace + btScalar(1.0); - - temp.f[0]=m_el[2].y() - m_el[1].z(); - temp.f[1]=m_el[0].z() - m_el[2].x(); - temp.f[2]=m_el[1].x() - m_el[0].y(); - temp.f[3]=x; - //temp.f[3]= s * btScalar(0.5); - } - else - { - int i, j, k; - if(m_el[0].x() < m_el[1].y()) - { - if( m_el[1].y() < m_el[2].z() ) - { i = 2; j = 0; k = 1; } - else - { i = 1; j = 2; k = 0; } - } 
- else - { - if( m_el[0].x() < m_el[2].z()) - { i = 2; j = 0; k = 1; } - else - { i = 0; j = 1; k = 2; } - } - - x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0); - - temp.f[3] = (m_el[k][j] - m_el[j][k]); - temp.f[j] = (m_el[j][i] + m_el[i][j]); - temp.f[k] = (m_el[k][i] + m_el[i][k]); - temp.f[i] = x; - //temp.f[i] = s * btScalar(0.5); - } - - s = btSqrt(x); - q.set128(temp.vec); - s = btScalar(0.5) / s; - - q *= s; -#else +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z(); + btScalar s, x; + + union { + btSimdFloat4 vec; + btScalar f[4]; + } temp; + + if (trace > btScalar(0.0)) + { + x = trace + btScalar(1.0); + + temp.f[0] = m_el[2].y() - m_el[1].z(); + temp.f[1] = m_el[0].z() - m_el[2].x(); + temp.f[2] = m_el[1].x() - m_el[0].y(); + temp.f[3] = x; + //temp.f[3]= s * btScalar(0.5); + } + else + { + int i, j, k; + if (m_el[0].x() < m_el[1].y()) + { + if (m_el[1].y() < m_el[2].z()) + { + i = 2; + j = 0; + k = 1; + } + else + { + i = 1; + j = 2; + k = 0; + } + } + else + { + if (m_el[0].x() < m_el[2].z()) + { + i = 2; + j = 0; + k = 1; + } + else + { + i = 0; + j = 1; + k = 2; + } + } + + x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0); + + temp.f[3] = (m_el[k][j] - m_el[j][k]); + temp.f[j] = (m_el[j][i] + m_el[i][j]); + temp.f[k] = (m_el[k][i] + m_el[i][k]); + temp.f[i] = x; + //temp.f[i] = s * btScalar(0.5); + } + + s = btSqrt(x); + q.set128(temp.vec); + s = btScalar(0.5) / s; + + q *= s; +#else btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z(); btScalar temp[4]; - if (trace > btScalar(0.0)) + if (trace > btScalar(0.0)) { btScalar s = btSqrt(trace + btScalar(1.0)); - temp[3]=(s * btScalar(0.5)); + temp[3] = (s * btScalar(0.5)); s = btScalar(0.5) / s; - temp[0]=((m_el[2].y() - m_el[1].z()) * s); - temp[1]=((m_el[0].z() - m_el[2].x()) * s); - temp[2]=((m_el[1].x() - m_el[0].y()) * s); - } - else + temp[0] = ((m_el[2].y() - m_el[1].z()) * s); + temp[1] = ((m_el[0].z() - m_el[2].x()) * s); + temp[2] = ((m_el[1].x() - m_el[0].y()) * s); + } + else { - int i = m_el[0].x() < m_el[1].y() ? - (m_el[1].y() < m_el[2].z() ? 2 : 1) : - (m_el[0].x() < m_el[2].z() ? 2 : 0); - int j = (i + 1) % 3; + int i = m_el[0].x() < m_el[1].y() ? (m_el[1].y() < m_el[2].z() ? 2 : 1) : (m_el[0].x() < m_el[2].z() ? 
2 : 0); + int j = (i + 1) % 3; int k = (i + 2) % 3; btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0)); @@ -481,44 +492,42 @@ public: temp[j] = (m_el[j][i] + m_el[i][j]) * s; temp[k] = (m_el[k][i] + m_el[i][k]) * s; } - q.setValue(temp[0],temp[1],temp[2],temp[3]); + q.setValue(temp[0], temp[1], temp[2], temp[3]); #endif } /**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR * @param yaw Yaw around Y axis * @param pitch Pitch around X axis - * @param roll around Z axis */ - void getEulerYPR(btScalar& yaw, btScalar& pitch, btScalar& roll) const + * @param roll around Z axis */ + void getEulerYPR(btScalar & yaw, btScalar & pitch, btScalar & roll) const { - // first use the normal calculus yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x())); pitch = btScalar(btAsin(-m_el[2].x())); roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z())); // on pitch = +/-HalfPI - if (btFabs(pitch)==SIMD_HALF_PI) + if (btFabs(pitch) == SIMD_HALF_PI) { - if (yaw>0) - yaw-=SIMD_PI; + if (yaw > 0) + yaw -= SIMD_PI; else - yaw+=SIMD_PI; + yaw += SIMD_PI; - if (roll>0) - roll-=SIMD_PI; + if (roll > 0) + roll -= SIMD_PI; else - roll+=SIMD_PI; + roll += SIMD_PI; } }; - /**@brief Get the matrix represented as euler angles around ZYX * @param yaw Yaw around Z axis * @param pitch Pitch around Y axis * @param roll around X axis - * @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/ - void getEulerZYX(btScalar& yaw, btScalar& pitch, btScalar& roll, unsigned int solution_number = 1) const + * @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/ + void getEulerZYX(btScalar & yaw, btScalar & pitch, btScalar & roll, unsigned int solution_number = 1) const { struct Euler { @@ -528,7 +537,7 @@ public: }; Euler euler_out; - Euler euler_out2; //second solution + Euler euler_out2; //second solution //get the pointer to the raw data // Check that pitch is not at a singularity @@ -538,7 +547,7 @@ public: euler_out2.yaw = 0; // From difference of angles formula - btScalar delta = btAtan2(m_el[0].x(),m_el[0].z()); + btScalar delta = btAtan2(m_el[0].x(), m_el[0].z()); if (m_el[2].x() > 0) //gimbal locked up { euler_out.pitch = SIMD_PI / btScalar(2.0); @@ -546,7 +555,7 @@ public: euler_out.roll = euler_out.pitch + delta; euler_out2.roll = euler_out.pitch + delta; } - else // gimbal locked down + else // gimbal locked down { euler_out.pitch = -SIMD_PI / btScalar(2.0); euler_out2.pitch = -SIMD_PI / btScalar(2.0); @@ -556,29 +565,29 @@ public: } else { - euler_out.pitch = - btAsin(m_el[2].x()); + euler_out.pitch = -btAsin(m_el[2].x()); euler_out2.pitch = SIMD_PI - euler_out.pitch; - euler_out.roll = btAtan2(m_el[2].y()/btCos(euler_out.pitch), - m_el[2].z()/btCos(euler_out.pitch)); - euler_out2.roll = btAtan2(m_el[2].y()/btCos(euler_out2.pitch), - m_el[2].z()/btCos(euler_out2.pitch)); + euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch), + m_el[2].z() / btCos(euler_out.pitch)); + euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch), + m_el[2].z() / btCos(euler_out2.pitch)); - euler_out.yaw = btAtan2(m_el[1].x()/btCos(euler_out.pitch), - m_el[0].x()/btCos(euler_out.pitch)); - euler_out2.yaw = btAtan2(m_el[1].x()/btCos(euler_out2.pitch), - m_el[0].x()/btCos(euler_out2.pitch)); + euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch), + m_el[0].x() / btCos(euler_out.pitch)); + euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch), + m_el[0].x() / 
btCos(euler_out2.pitch)); } if (solution_number == 1) - { - yaw = euler_out.yaw; + { + yaw = euler_out.yaw; pitch = euler_out.pitch; roll = euler_out.roll; } else - { - yaw = euler_out2.yaw; + { + yaw = euler_out2.yaw; pitch = euler_out2.pitch; roll = euler_out2.roll; } @@ -589,18 +598,18 @@ public: btMatrix3x3 scaled(const btVector3& s) const { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s); -#else +#else return btMatrix3x3( - m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(), + m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(), m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(), m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z()); #endif } /**@brief Return the determinant of the matrix */ - btScalar determinant() const; + btScalar determinant() const; /**@brief Return the adjoint of the matrix */ btMatrix3x3 adjoint() const; /**@brief Return the matrix with all values non negative */ @@ -608,7 +617,7 @@ public: /**@brief Return the transpose of the matrix */ btMatrix3x3 transpose() const; /**@brief Return the inverse of the matrix */ - btMatrix3x3 inverse() const; + btMatrix3x3 inverse() const; /// Solve A * x = b, where b is a column vector. This is more efficient /// than computing the inverse in one-shot cases. @@ -618,9 +627,9 @@ public: btVector3 col1 = getColumn(0); btVector3 col2 = getColumn(1); btVector3 col3 = getColumn(2); - + btScalar det = btDot(col1, btCross(col2, col3)); - if (btFabs(det)>SIMD_EPSILON) + if (btFabs(det) > SIMD_EPSILON) { det = 1.0f / det; } @@ -634,15 +643,15 @@ public: btMatrix3x3 transposeTimes(const btMatrix3x3& m) const; btMatrix3x3 timesTranspose(const btMatrix3x3& m) const; - SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const + SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const { return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z(); } - SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const + SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const { return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z(); } - SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const + SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const { return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z(); } @@ -653,31 +662,25 @@ public: ///symmetric matrix S: ///A = R*S. ///note that R can include both rotation and scaling. 
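
// Aside, not part of the patch: a usage sketch for extractRotation(), which
// is declared just below. The input matrix and scale factors here are
// hypothetical; the initial quaternion only needs to be a rough guess.
void exampleExtractRotation()  // hypothetical helper
{
	btMatrix3x3 A;
	A.setEulerZYX(btScalar(0.1), btScalar(0.2), btScalar(0.3));
	// Bake a non-uniform scale into A so it is a rotation times a symmetric S.
	A = A.scaled(btVector3(btScalar(2.0), btScalar(1.0), btScalar(0.5)));

	btQuaternion q(btScalar(0.), btScalar(0.), btScalar(0.), btScalar(1.));  // identity guess
	A.extractRotation(q);  // default tolerance and iteration cap
	// q now approximates the rotation that setEulerZYX() baked into A.
}
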
- SIMD_FORCE_INLINE void extractRotation(btQuaternion &q,btScalar tolerance = 1.0e-9, int maxIter=100) + SIMD_FORCE_INLINE void extractRotation(btQuaternion & q, btScalar tolerance = 1.0e-9, int maxIter = 100) { - int iter =0; + int iter = 0; btScalar w; - const btMatrix3x3& A=*this; - for(iter = 0; iter < maxIter; iter++) + const btMatrix3x3& A = *this; + for (iter = 0; iter < maxIter; iter++) { btMatrix3x3 R(q); - btVector3 omega = (R.getColumn(0).cross(A.getColumn(0)) + R.getColumn(1).cross(A.getColumn(1)) - + R.getColumn(2).cross(A.getColumn(2)) - ) * (btScalar(1.0) / btFabs(R.getColumn(0).dot(A.getColumn(0)) + R.getColumn - (1).dot(A.getColumn(1)) + R.getColumn(2).dot(A.getColumn(2))) + - tolerance); + btVector3 omega = (R.getColumn(0).cross(A.getColumn(0)) + R.getColumn(1).cross(A.getColumn(1)) + R.getColumn(2).cross(A.getColumn(2))) * (btScalar(1.0) / btFabs(R.getColumn(0).dot(A.getColumn(0)) + R.getColumn(1).dot(A.getColumn(1)) + R.getColumn(2).dot(A.getColumn(2))) + + tolerance); w = omega.norm(); - if(w < tolerance) + if (w < tolerance) break; - q = btQuaternion(btVector3((btScalar(1.0) / w) * omega),w) * + q = btQuaternion(btVector3((btScalar(1.0) / w) * omega), w) * q; q.normalize(); } } - - - /**@brief diagonalizes this matrix by the Jacobi method. * @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original * coordinate system, i.e., old_this = rot * new_this * rot^T. @@ -687,7 +690,7 @@ public: * * Note that this matrix is assumed to be symmetric. */ - void diagonalize(btMatrix3x3& rot, btScalar threshold, int maxSteps) + void diagonalize(btMatrix3x3 & rot, btScalar threshold, int maxSteps) { rot.setIdentity(); for (int step = maxSteps; step > 0; step--) @@ -723,7 +726,7 @@ public: step = 1; } - // compute Jacobi rotation J which leads to a zero for element [p][q] + // compute Jacobi rotation J which leads to a zero for element [p][q] btScalar mpq = m_el[p][q]; btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq); btScalar theta2 = theta * theta; @@ -732,7 +735,7 @@ public: if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON)) { t = (theta >= 0) ? 
1 / (theta + btSqrt(1 + theta2)) - : 1 / (theta - btSqrt(1 + theta2)); + : 1 / (theta - btSqrt(1 + theta2)); cos = 1 / btSqrt(1 + t * t); sin = cos * t; } @@ -765,8 +768,6 @@ public: } } - - /**@brief Calculate the matrix cofactor * @param r1 The first row to use for calculating the cofactor * @param c1 The first column to use for calculating the cofactor @@ -774,304 +775,298 @@ public: * @param c1 The second column to use for calculating the cofactor * See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details */ - btScalar cofac(int r1, int c1, int r2, int c2) const + btScalar cofac(int r1, int c1, int r2, int c2) const { return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1]; } - void serialize(struct btMatrix3x3Data& dataOut) const; + void serialize(struct btMatrix3x3Data & dataOut) const; - void serializeFloat(struct btMatrix3x3FloatData& dataOut) const; + void serializeFloat(struct btMatrix3x3FloatData & dataOut) const; - void deSerialize(const struct btMatrix3x3Data& dataIn); + void deSerialize(const struct btMatrix3x3Data& dataIn); - void deSerializeFloat(const struct btMatrix3x3FloatData& dataIn); - - void deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn); + void deSerializeFloat(const struct btMatrix3x3FloatData& dataIn); + void deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn); }; - -SIMD_FORCE_INLINE btMatrix3x3& +SIMD_FORCE_INLINE btMatrix3x3& btMatrix3x3::operator*=(const btMatrix3x3& m) { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 rv00, rv01, rv02; - __m128 rv10, rv11, rv12; - __m128 rv20, rv21, rv22; - __m128 mv0, mv1, mv2; - - rv02 = m_el[0].mVec128; - rv12 = m_el[1].mVec128; - rv22 = m_el[2].mVec128; - - mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask); - mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask); - mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask); - - // rv0 - rv00 = bt_splat_ps(rv02, 0); - rv01 = bt_splat_ps(rv02, 1); - rv02 = bt_splat_ps(rv02, 2); - - rv00 = _mm_mul_ps(rv00, mv0); - rv01 = _mm_mul_ps(rv01, mv1); - rv02 = _mm_mul_ps(rv02, mv2); - - // rv1 - rv10 = bt_splat_ps(rv12, 0); - rv11 = bt_splat_ps(rv12, 1); - rv12 = bt_splat_ps(rv12, 2); - - rv10 = _mm_mul_ps(rv10, mv0); - rv11 = _mm_mul_ps(rv11, mv1); - rv12 = _mm_mul_ps(rv12, mv2); - - // rv2 - rv20 = bt_splat_ps(rv22, 0); - rv21 = bt_splat_ps(rv22, 1); - rv22 = bt_splat_ps(rv22, 2); - - rv20 = _mm_mul_ps(rv20, mv0); - rv21 = _mm_mul_ps(rv21, mv1); - rv22 = _mm_mul_ps(rv22, mv2); - - rv00 = _mm_add_ps(rv00, rv01); - rv10 = _mm_add_ps(rv10, rv11); - rv20 = _mm_add_ps(rv20, rv21); - - m_el[0].mVec128 = _mm_add_ps(rv00, rv02); - m_el[1].mVec128 = _mm_add_ps(rv10, rv12); - m_el[2].mVec128 = _mm_add_ps(rv20, rv22); +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 rv00, rv01, rv02; + __m128 rv10, rv11, rv12; + __m128 rv20, rv21, rv22; + __m128 mv0, mv1, mv2; + + rv02 = m_el[0].mVec128; + rv12 = m_el[1].mVec128; + rv22 = m_el[2].mVec128; + + mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask); + mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask); + mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask); + + // rv0 + rv00 = bt_splat_ps(rv02, 0); + rv01 = bt_splat_ps(rv02, 1); + rv02 = bt_splat_ps(rv02, 2); + + rv00 = _mm_mul_ps(rv00, mv0); + rv01 = _mm_mul_ps(rv01, mv1); + rv02 = _mm_mul_ps(rv02, mv2); + + // rv1 + rv10 = bt_splat_ps(rv12, 0); + rv11 = bt_splat_ps(rv12, 1); + rv12 = bt_splat_ps(rv12, 2); + + rv10 = _mm_mul_ps(rv10, mv0); + rv11 = _mm_mul_ps(rv11, mv1); + rv12 = _mm_mul_ps(rv12, 
mv2); + + // rv2 + rv20 = bt_splat_ps(rv22, 0); + rv21 = bt_splat_ps(rv22, 1); + rv22 = bt_splat_ps(rv22, 2); + + rv20 = _mm_mul_ps(rv20, mv0); + rv21 = _mm_mul_ps(rv21, mv1); + rv22 = _mm_mul_ps(rv22, mv2); + + rv00 = _mm_add_ps(rv00, rv01); + rv10 = _mm_add_ps(rv10, rv11); + rv20 = _mm_add_ps(rv20, rv21); + + m_el[0].mVec128 = _mm_add_ps(rv00, rv02); + m_el[1].mVec128 = _mm_add_ps(rv10, rv12); + m_el[2].mVec128 = _mm_add_ps(rv20, rv22); #elif defined(BT_USE_NEON) - float32x4_t rv0, rv1, rv2; - float32x4_t v0, v1, v2; - float32x4_t mv0, mv1, mv2; - - v0 = m_el[0].mVec128; - v1 = m_el[1].mVec128; - v2 = m_el[2].mVec128; - - mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); - mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); - mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); - - rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); - rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); - rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); - - rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); - rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); - rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); - - rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); - rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); - rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); - - m_el[0].mVec128 = rv0; - m_el[1].mVec128 = rv1; - m_el[2].mVec128 = rv2; -#else + float32x4_t rv0, rv1, rv2; + float32x4_t v0, v1, v2; + float32x4_t mv0, mv1, mv2; + + v0 = m_el[0].mVec128; + v1 = m_el[1].mVec128; + v2 = m_el[2].mVec128; + + mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); + mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); + mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); + + rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); + rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); + rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); + + rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); + rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); + rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); + + rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); + rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); + rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); + + m_el[0].mVec128 = rv0; + m_el[1].mVec128 = rv1; + m_el[2].mVec128 = rv2; +#else setValue( - m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]), + m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]), m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]), m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2])); #endif return *this; } -SIMD_FORCE_INLINE btMatrix3x3& +SIMD_FORCE_INLINE btMatrix3x3& btMatrix3x3::operator+=(const btMatrix3x3& m) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) - m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128; - m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128; - m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128; +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128; + m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128; + m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128; #else setValue( - m_el[0][0]+m.m_el[0][0], - m_el[0][1]+m.m_el[0][1], - m_el[0][2]+m.m_el[0][2], - m_el[1][0]+m.m_el[1][0], - m_el[1][1]+m.m_el[1][1], - m_el[1][2]+m.m_el[1][2], - m_el[2][0]+m.m_el[2][0], - m_el[2][1]+m.m_el[2][1], - m_el[2][2]+m.m_el[2][2]); + m_el[0][0] + m.m_el[0][0], + m_el[0][1] + m.m_el[0][1], + 
m_el[0][2] + m.m_el[0][2], + m_el[1][0] + m.m_el[1][0], + m_el[1][1] + m.m_el[1][1], + m_el[1][2] + m.m_el[1][2], + m_el[2][0] + m.m_el[2][0], + m_el[2][1] + m.m_el[2][1], + m_el[2][2] + m.m_el[2][2]); #endif return *this; } SIMD_FORCE_INLINE btMatrix3x3 -operator*(const btMatrix3x3& m, const btScalar & k) +operator*(const btMatrix3x3& m, const btScalar& k) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80); - return btMatrix3x3( - _mm_mul_ps(m[0].mVec128, vk), - _mm_mul_ps(m[1].mVec128, vk), - _mm_mul_ps(m[2].mVec128, vk)); +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + __m128 vk = bt_splat_ps(_mm_load_ss((float*)&k), 0x80); + return btMatrix3x3( + _mm_mul_ps(m[0].mVec128, vk), + _mm_mul_ps(m[1].mVec128, vk), + _mm_mul_ps(m[2].mVec128, vk)); #elif defined(BT_USE_NEON) - return btMatrix3x3( - vmulq_n_f32(m[0].mVec128, k), - vmulq_n_f32(m[1].mVec128, k), - vmulq_n_f32(m[2].mVec128, k)); + return btMatrix3x3( + vmulq_n_f32(m[0].mVec128, k), + vmulq_n_f32(m[1].mVec128, k), + vmulq_n_f32(m[2].mVec128, k)); #else return btMatrix3x3( - m[0].x()*k,m[0].y()*k,m[0].z()*k, - m[1].x()*k,m[1].y()*k,m[1].z()*k, - m[2].x()*k,m[2].y()*k,m[2].z()*k); + m[0].x() * k, m[0].y() * k, m[0].z() * k, + m[1].x() * k, m[1].y() * k, m[1].z() * k, + m[2].x() * k, m[2].y() * k, m[2].z() * k); #endif } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) return btMatrix3x3( - m1[0].mVec128 + m2[0].mVec128, - m1[1].mVec128 + m2[1].mVec128, - m1[2].mVec128 + m2[2].mVec128); + m1[0].mVec128 + m2[0].mVec128, + m1[1].mVec128 + m2[1].mVec128, + m1[2].mVec128 + m2[2].mVec128); #else return btMatrix3x3( - m1[0][0]+m2[0][0], - m1[0][1]+m2[0][1], - m1[0][2]+m2[0][2], - - m1[1][0]+m2[1][0], - m1[1][1]+m2[1][1], - m1[1][2]+m2[1][2], - - m1[2][0]+m2[2][0], - m1[2][1]+m2[2][1], - m1[2][2]+m2[2][2]); -#endif + m1[0][0] + m2[0][0], + m1[0][1] + m2[0][1], + m1[0][2] + m2[0][2], + + m1[1][0] + m2[1][0], + m1[1][1] + m2[1][1], + m1[1][2] + m2[1][2], + + m1[2][0] + m2[2][0], + m1[2][1] + m2[2][1], + m1[2][2] + m2[2][2]); +#endif } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) return btMatrix3x3( - m1[0].mVec128 - m2[0].mVec128, - m1[1].mVec128 - m2[1].mVec128, - m1[2].mVec128 - m2[2].mVec128); + m1[0].mVec128 - m2[0].mVec128, + m1[1].mVec128 - m2[1].mVec128, + m1[2].mVec128 - m2[2].mVec128); #else return btMatrix3x3( - m1[0][0]-m2[0][0], - m1[0][1]-m2[0][1], - m1[0][2]-m2[0][2], - - m1[1][0]-m2[1][0], - m1[1][1]-m2[1][1], - m1[1][2]-m2[1][2], - - m1[2][0]-m2[2][0], - m1[2][1]-m2[2][1], - m1[2][2]-m2[2][2]); + m1[0][0] - m2[0][0], + m1[0][1] - m2[0][1], + m1[0][2] - m2[0][2], + + m1[1][0] - m2[1][0], + m1[1][1] - m2[1][1], + m1[1][2] - m2[1][2], + + m1[2][0] - m2[2][0], + m1[2][1] - m2[2][1], + m1[2][2] - m2[2][2]); #endif } - -SIMD_FORCE_INLINE btMatrix3x3& +SIMD_FORCE_INLINE btMatrix3x3& btMatrix3x3::operator-=(const btMatrix3x3& m) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) - m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128; - m_el[1].mVec128 = 
m_el[1].mVec128 - m.m_el[1].mVec128; - m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128; +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128; + m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128; + m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128; #else setValue( - m_el[0][0]-m.m_el[0][0], - m_el[0][1]-m.m_el[0][1], - m_el[0][2]-m.m_el[0][2], - m_el[1][0]-m.m_el[1][0], - m_el[1][1]-m.m_el[1][1], - m_el[1][2]-m.m_el[1][2], - m_el[2][0]-m.m_el[2][0], - m_el[2][1]-m.m_el[2][1], - m_el[2][2]-m.m_el[2][2]); + m_el[0][0] - m.m_el[0][0], + m_el[0][1] - m.m_el[0][1], + m_el[0][2] - m.m_el[0][2], + m_el[1][0] - m.m_el[1][0], + m_el[1][1] - m.m_el[1][1], + m_el[1][2] - m.m_el[1][2], + m_el[2][0] - m.m_el[2][0], + m_el[2][1] - m.m_el[2][1], + m_el[2][2] - m.m_el[2][2]); #endif return *this; } - -SIMD_FORCE_INLINE btScalar +SIMD_FORCE_INLINE btScalar btMatrix3x3::determinant() const -{ +{ return btTriple((*this)[0], (*this)[1], (*this)[2]); } - -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 btMatrix3x3::absolute() const { -#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - return btMatrix3x3( - _mm_and_ps(m_el[0].mVec128, btvAbsfMask), - _mm_and_ps(m_el[1].mVec128, btvAbsfMask), - _mm_and_ps(m_el[2].mVec128, btvAbsfMask)); +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + return btMatrix3x3( + _mm_and_ps(m_el[0].mVec128, btvAbsfMask), + _mm_and_ps(m_el[1].mVec128, btvAbsfMask), + _mm_and_ps(m_el[2].mVec128, btvAbsfMask)); #elif defined(BT_USE_NEON) - return btMatrix3x3( - (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask), - (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask), - (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask)); -#else return btMatrix3x3( - btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()), - btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()), - btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z())); + (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask), + (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask), + (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask)); +#else + return btMatrix3x3( + btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()), + btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()), + btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z())); #endif } -SIMD_FORCE_INLINE btMatrix3x3 -btMatrix3x3::transpose() const +SIMD_FORCE_INLINE btMatrix3x3 +btMatrix3x3::transpose() const { -#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - __m128 v0 = m_el[0].mVec128; - __m128 v1 = m_el[1].mVec128; - __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 - __m128 vT; - - v2 = _mm_and_ps(v2, btvFFF0fMask); // x2 y2 z2 0 - - vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * - v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 - - v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) ); // y0 y1 y2 0 - v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) ); // x0 x1 x2 0 - v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT))); // z0 z1 z2 0 - - - return btMatrix3x3( v0, v1, v2 ); +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + __m128 v0 = m_el[0].mVec128; + __m128 v1 = m_el[1].mVec128; + __m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2 + __m128 vT; + + v2 = _mm_and_ps(v2, btvFFF0fMask); // x2 y2 z2 0 
+ + vT = _mm_unpackhi_ps(v0, v1); // z0 z1 * * + v0 = _mm_unpacklo_ps(v0, v1); // x0 x1 y0 y1 + + v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3)); // y0 y1 y2 0 + v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3)); // x0 x1 x2 0 + v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT))); // z0 z1 z2 0 + + return btMatrix3x3(v0, v1, v2); #elif defined(BT_USE_NEON) - // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. - static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 }; - float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1} - float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0} - float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] ); - float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] ); - float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask ); - float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q ); // z0 z1 z2 0 - return btMatrix3x3( v0, v1, v2 ); + // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions. + static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0}; + float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128); // {x0 x1 z0 z1}, {y0 y1 w0 w1} + float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f)); // {x2 0 }, {y2 0} + float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]); + float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]); + float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask); + float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q); // z0 z1 z2 0 + return btMatrix3x3(v0, v1, v2); #else - return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(), - m_el[0].y(), m_el[1].y(), m_el[2].y(), - m_el[0].z(), m_el[1].z(), m_el[2].z()); + return btMatrix3x3(m_el[0].x(), m_el[1].x(), m_el[2].x(), + m_el[0].y(), m_el[1].y(), m_el[2].y(), + m_el[0].z(), m_el[1].z(), m_el[2].z()); #endif } -SIMD_FORCE_INLINE btMatrix3x3 -btMatrix3x3::adjoint() const +SIMD_FORCE_INLINE btMatrix3x3 +btMatrix3x3::adjoint() const { return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2), - cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0), - cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1)); + cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0), + cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1)); } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 btMatrix3x3::inverse() const { btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1)); @@ -1080,54 +1075,54 @@ btMatrix3x3::inverse() const btAssert(det != btScalar(0.0)); btScalar s = btScalar(1.0) / det; return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s, - co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s, - co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s); + co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s, + co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s); } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const { -#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - // zeros w -// static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL }; - __m128 row = m_el[0].mVec128; - __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, 
btvFFF0fMask ); - __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask); - __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask ); - __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0)); - __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55)); - __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa)); - row = m_el[1].mVec128; - r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0))); - r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55))); - r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa))); - row = m_el[2].mVec128; - r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0))); - r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55))); - r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa))); - return btMatrix3x3( r0, r1, r2 ); +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + // zeros w + // static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL }; + __m128 row = m_el[0].mVec128; + __m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask); + __m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask); + __m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask); + __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0)); + __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55)); + __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa)); + row = m_el[1].mVec128; + r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0))); + r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa))); + row = m_el[2].mVec128; + r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0))); + r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa))); + return btMatrix3x3(r0, r1, r2); #elif defined BT_USE_NEON - // zeros w - static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 }; - float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask ); - float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask ); - float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask ); - float32x4_t row = m_el[0].mVec128; - float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0); - float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1); - float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0); - row = m_el[1].mVec128; - r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0); - r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1); - r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0); - row = m_el[2].mVec128; - r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0); - r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1); - r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0); - return btMatrix3x3( r0, r1, r2 ); + // zeros w + static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0}; + float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask); + float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask); + float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask); + float32x4_t row = m_el[0].mVec128; + float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0); + float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1); + float32x4_t r2 = vmulq_lane_f32(m0, 
vget_high_f32(row), 0); + row = m_el[1].mVec128; + r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0); + r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1); + r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0); + row = m_el[2].mVec128; + r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0); + r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1); + r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0); + return btMatrix3x3(r0, r1, r2); #else - return btMatrix3x3( + return btMatrix3x3( m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(), m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(), m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(), @@ -1140,51 +1135,51 @@ btMatrix3x3::transposeTimes(const btMatrix3x3& m) const #endif } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - __m128 a0 = m_el[0].mVec128; - __m128 a1 = m_el[1].mVec128; - __m128 a2 = m_el[2].mVec128; - - btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here - __m128 mx = mT[0].mVec128; - __m128 my = mT[1].mVec128; - __m128 mz = mT[2].mVec128; - - __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00)); - __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00)); - __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00)); - r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55))); - r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55))); - r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55))); - r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa))); - r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa))); - r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa))); - return btMatrix3x3( r0, r1, r2); - +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + __m128 a0 = m_el[0].mVec128; + __m128 a1 = m_el[1].mVec128; + __m128 a2 = m_el[2].mVec128; + + btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here + __m128 mx = mT[0].mVec128; + __m128 my = mT[1].mVec128; + __m128 mz = mT[2].mVec128; + + __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00)); + __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00)); + __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00)); + r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55))); + r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55))); + r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55))); + r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa))); + r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa))); + r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa))); + return btMatrix3x3(r0, r1, r2); + #elif defined BT_USE_NEON - float32x4_t a0 = m_el[0].mVec128; - float32x4_t a1 = m_el[1].mVec128; - float32x4_t a2 = m_el[2].mVec128; - - btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here - float32x4_t mx = mT[0].mVec128; - float32x4_t my = mT[1].mVec128; - float32x4_t mz = mT[2].mVec128; - - float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0); - float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0); - float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0); - r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1); - r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1); - r2 = vmlaq_lane_f32( r2, 
my, vget_low_f32(a2), 1); - r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0); - r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0); - r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0); - return btMatrix3x3( r0, r1, r2 ); - + float32x4_t a0 = m_el[0].mVec128; + float32x4_t a1 = m_el[1].mVec128; + float32x4_t a2 = m_el[2].mVec128; + + btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here + float32x4_t mx = mT[0].mVec128; + float32x4_t my = mT[1].mVec128; + float32x4_t mz = mT[2].mVec128; + + float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0); + float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0); + float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0); + r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1); + r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1); + r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1); + r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0); + r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0); + r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0); + return btMatrix3x3(r0, r1, r2); + #else return btMatrix3x3( m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]), @@ -1193,139 +1188,138 @@ btMatrix3x3::timesTranspose(const btMatrix3x3& m) const #endif } -SIMD_FORCE_INLINE btVector3 -operator*(const btMatrix3x3& m, const btVector3& v) +SIMD_FORCE_INLINE btVector3 +operator*(const btMatrix3x3& m, const btVector3& v) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON) - return v.dot3(m[0], m[1], m[2]); +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + return v.dot3(m[0], m[1], m[2]); #else return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v)); #endif } - SIMD_FORCE_INLINE btVector3 operator*(const btVector3& v, const btMatrix3x3& m) { -#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + + const __m128 vv = v.mVec128; - const __m128 vv = v.mVec128; + __m128 c0 = bt_splat_ps(vv, 0); + __m128 c1 = bt_splat_ps(vv, 1); + __m128 c2 = bt_splat_ps(vv, 2); - __m128 c0 = bt_splat_ps( vv, 0); - __m128 c1 = bt_splat_ps( vv, 1); - __m128 c2 = bt_splat_ps( vv, 2); + c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask)); + c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask)); + c0 = _mm_add_ps(c0, c1); + c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask)); - c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) ); - c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) ); - c0 = _mm_add_ps(c0, c1); - c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) ); - - return btVector3(_mm_add_ps(c0, c2)); + return btVector3(_mm_add_ps(c0, c2)); #elif defined(BT_USE_NEON) - const float32x4_t vv = v.mVec128; - const float32x2_t vlo = vget_low_f32(vv); - const float32x2_t vhi = vget_high_f32(vv); - - float32x4_t c0, c1, c2; - - c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); - c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); - c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); - - c0 = vmulq_lane_f32(c0, vlo, 0); - c1 = vmulq_lane_f32(c1, vlo, 1); - c2 = vmulq_lane_f32(c2, vhi, 0); - c0 = vaddq_f32(c0, c1); - c0 = vaddq_f32(c0, c2); - - return btVector3(c0); + const float32x4_t vv = v.mVec128; + const float32x2_t vlo = vget_low_f32(vv); + const float32x2_t vhi = vget_high_f32(vv); + + float32x4_t c0, c1, c2; + + c0 = 
(float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); + c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); + c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); + + c0 = vmulq_lane_f32(c0, vlo, 0); + c1 = vmulq_lane_f32(c1, vlo, 1); + c2 = vmulq_lane_f32(c2, vhi, 0); + c0 = vaddq_f32(c0, c1); + c0 = vaddq_f32(c0, c2); + + return btVector3(c0); #else return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v)); #endif } -SIMD_FORCE_INLINE btMatrix3x3 +SIMD_FORCE_INLINE btMatrix3x3 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2) { -#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) - - __m128 m10 = m1[0].mVec128; - __m128 m11 = m1[1].mVec128; - __m128 m12 = m1[2].mVec128; - - __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask); - - __m128 c0 = bt_splat_ps( m10, 0); - __m128 c1 = bt_splat_ps( m11, 0); - __m128 c2 = bt_splat_ps( m12, 0); - - c0 = _mm_mul_ps(c0, m2v); - c1 = _mm_mul_ps(c1, m2v); - c2 = _mm_mul_ps(c2, m2v); - - m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask); - - __m128 c0_1 = bt_splat_ps( m10, 1); - __m128 c1_1 = bt_splat_ps( m11, 1); - __m128 c2_1 = bt_splat_ps( m12, 1); - - c0_1 = _mm_mul_ps(c0_1, m2v); - c1_1 = _mm_mul_ps(c1_1, m2v); - c2_1 = _mm_mul_ps(c2_1, m2v); - - m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask); - - c0 = _mm_add_ps(c0, c0_1); - c1 = _mm_add_ps(c1, c1_1); - c2 = _mm_add_ps(c2, c2_1); - - m10 = bt_splat_ps( m10, 2); - m11 = bt_splat_ps( m11, 2); - m12 = bt_splat_ps( m12, 2); - - m10 = _mm_mul_ps(m10, m2v); - m11 = _mm_mul_ps(m11, m2v); - m12 = _mm_mul_ps(m12, m2v); - - c0 = _mm_add_ps(c0, m10); - c1 = _mm_add_ps(c1, m11); - c2 = _mm_add_ps(c2, m12); - - return btMatrix3x3(c0, c1, c2); +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + + __m128 m10 = m1[0].mVec128; + __m128 m11 = m1[1].mVec128; + __m128 m12 = m1[2].mVec128; + + __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask); + + __m128 c0 = bt_splat_ps(m10, 0); + __m128 c1 = bt_splat_ps(m11, 0); + __m128 c2 = bt_splat_ps(m12, 0); + + c0 = _mm_mul_ps(c0, m2v); + c1 = _mm_mul_ps(c1, m2v); + c2 = _mm_mul_ps(c2, m2v); + + m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask); + + __m128 c0_1 = bt_splat_ps(m10, 1); + __m128 c1_1 = bt_splat_ps(m11, 1); + __m128 c2_1 = bt_splat_ps(m12, 1); + + c0_1 = _mm_mul_ps(c0_1, m2v); + c1_1 = _mm_mul_ps(c1_1, m2v); + c2_1 = _mm_mul_ps(c2_1, m2v); + + m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask); + + c0 = _mm_add_ps(c0, c0_1); + c1 = _mm_add_ps(c1, c1_1); + c2 = _mm_add_ps(c2, c2_1); + + m10 = bt_splat_ps(m10, 2); + m11 = bt_splat_ps(m11, 2); + m12 = bt_splat_ps(m12, 2); + + m10 = _mm_mul_ps(m10, m2v); + m11 = _mm_mul_ps(m11, m2v); + m12 = _mm_mul_ps(m12, m2v); + + c0 = _mm_add_ps(c0, m10); + c1 = _mm_add_ps(c1, m11); + c2 = _mm_add_ps(c2, m12); + + return btMatrix3x3(c0, c1, c2); #elif defined(BT_USE_NEON) - float32x4_t rv0, rv1, rv2; - float32x4_t v0, v1, v2; - float32x4_t mv0, mv1, mv2; - - v0 = m1[0].mVec128; - v1 = m1[1].mVec128; - v2 = m1[2].mVec128; - - mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask); - mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask); - mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask); - - rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); - rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); - rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); - - rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); - rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); - rv2 = vmlaq_lane_f32(rv2, mv1, 
vget_low_f32(v2), 1); - - rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); - rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); - rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); + float32x4_t rv0, rv1, rv2; + float32x4_t v0, v1, v2; + float32x4_t mv0, mv1, mv2; + + v0 = m1[0].mVec128; + v1 = m1[1].mVec128; + v2 = m1[2].mVec128; + + mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask); + mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask); + mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask); + + rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0); + rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0); + rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0); + + rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1); + rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1); + rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1); + + rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0); + rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0); + rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0); return btMatrix3x3(rv0, rv1, rv2); - -#else + +#else return btMatrix3x3( - m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]), - m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]), - m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2])); + m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]), + m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]), + m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2])); #endif } @@ -1348,73 +1342,67 @@ m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]); * It will test all elements are equal. */ SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2) { -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) + + __m128 c0, c1, c2; - __m128 c0, c1, c2; + c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128); + c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128); + c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128); - c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128); - c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128); - c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128); - - c0 = _mm_and_ps(c0, c1); - c0 = _mm_and_ps(c0, c2); + c0 = _mm_and_ps(c0, c1); + c0 = _mm_and_ps(c0, c2); int m = _mm_movemask_ps((__m128)c0); return (0x7 == (m & 0x7)); - -#else - return - ( m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] && - m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] && - m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] ); + +#else + return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] && + m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] && + m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]); #endif } ///for serialization -struct btMatrix3x3FloatData +struct btMatrix3x3FloatData { btVector3FloatData m_el[3]; }; ///for serialization -struct btMatrix3x3DoubleData +struct btMatrix3x3DoubleData { btVector3DoubleData m_el[3]; }; - - - -SIMD_FORCE_INLINE void btMatrix3x3::serialize(struct btMatrix3x3Data& dataOut) const +SIMD_FORCE_INLINE void btMatrix3x3::serialize(struct btMatrix3x3Data& dataOut) const { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].serialize(dataOut.m_el[i]); } -SIMD_FORCE_INLINE void btMatrix3x3::serializeFloat(struct btMatrix3x3FloatData& dataOut) const +SIMD_FORCE_INLINE void btMatrix3x3::serializeFloat(struct btMatrix3x3FloatData& dataOut) const { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) 
m_el[i].serializeFloat(dataOut.m_el[i]); } - -SIMD_FORCE_INLINE void btMatrix3x3::deSerialize(const struct btMatrix3x3Data& dataIn) +SIMD_FORCE_INLINE void btMatrix3x3::deSerialize(const struct btMatrix3x3Data& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerialize(dataIn.m_el[i]); } -SIMD_FORCE_INLINE void btMatrix3x3::deSerializeFloat(const struct btMatrix3x3FloatData& dataIn) +SIMD_FORCE_INLINE void btMatrix3x3::deSerializeFloat(const struct btMatrix3x3FloatData& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerializeFloat(dataIn.m_el[i]); } -SIMD_FORCE_INLINE void btMatrix3x3::deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn) +SIMD_FORCE_INLINE void btMatrix3x3::deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn) { - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) m_el[i].deSerializeDouble(dataIn.m_el[i]); } -#endif //BT_MATRIX3x3_H - +#endif //BT_MATRIX3x3_H diff --git a/thirdparty/bullet/LinearMath/btMatrixX.h b/thirdparty/bullet/LinearMath/btMatrixX.h index 42caed42ef..9df9e49469 100644 --- a/thirdparty/bullet/LinearMath/btMatrixX.h +++ b/thirdparty/bullet/LinearMath/btMatrixX.h @@ -24,24 +24,23 @@ subject to the following restrictions: //#define BT_DEBUG_OSTREAM #ifdef BT_DEBUG_OSTREAM #include <iostream> -#include <iomanip> // std::setw -#endif //BT_DEBUG_OSTREAM +#include <iomanip> // std::setw +#endif //BT_DEBUG_OSTREAM class btIntSortPredicate { - public: - bool operator() ( const int& a, const int& b ) const - { - return a < b; - } +public: + bool operator()(const int& a, const int& b) const + { + return a < b; + } }; - template <typename T> struct btVectorX { - btAlignedObjectArray<T> m_storage; - + btAlignedObjectArray<T> m_storage; + btVectorX() { } @@ -49,7 +48,7 @@ struct btVectorX { m_storage.resize(numRows); } - + void resize(int rows) { m_storage.resize(rows); @@ -66,13 +65,13 @@ struct btVectorX { return rows(); } - + T nrm2() const { T norm = T(0); - + int nn = rows(); - + { if (nn == 1) { @@ -82,11 +81,11 @@ struct btVectorX { T scale = 0.0; T ssq = 1.0; - + /* The following loop is equivalent to this call to the LAPACK auxiliary routine: CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */ - - for (int ix=0;ix @@ -151,8 +148,7 @@ struct btVectorX } */ - -template <typename T> +template <typename T> struct btMatrixX { int m_rows; int m_cols; int m_operations; int m_resizeOperations; int m_setElemOperations; - btAlignedObjectArray<T> m_storage; - mutable btAlignedObjectArray< btAlignedObjectArray<int> > m_rowNonZeroElements1; + btAlignedObjectArray<T> m_storage; + mutable btAlignedObjectArray<btAlignedObjectArray<int> > m_rowNonZeroElements1; - T* getBufferPointerWritable() + T* getBufferPointerWritable() { return m_storage.size() ? &m_storage[0] : 0; } @@ -174,21 +170,21 @@ struct btMatrixX return m_storage.size() ? 
&m_storage[0] : 0; } btMatrixX() - :m_rows(0), - m_cols(0), - m_operations(0), - m_resizeOperations(0), - m_setElemOperations(0) + : m_rows(0), + m_cols(0), + m_operations(0), + m_resizeOperations(0), + m_setElemOperations(0) { } - btMatrixX(int rows,int cols) - :m_rows(rows), - m_cols(cols), - m_operations(0), - m_resizeOperations(0), - m_setElemOperations(0) + btMatrixX(int rows, int cols) + : m_rows(rows), + m_cols(cols), + m_operations(0), + m_resizeOperations(0), + m_setElemOperations(0) { - resize(rows,cols); + resize(rows, cols); } void resize(int rows, int cols) { @@ -197,7 +193,7 @@ struct btMatrixX m_cols = cols; { BT_PROFILE("m_storage.resize"); - m_storage.resize(rows*cols); + m_storage.resize(rows * cols); } } int cols() const @@ -215,108 +211,99 @@ struct btMatrixX } */ - void addElem(int row,int col, T val) + void addElem(int row, int col, T val) { if (val) { - if (m_storage[col+row*m_cols]==0.f) + if (m_storage[col + row * m_cols] == 0.f) { - setElem(row,col,val); - } else + setElem(row, col, val); + } + else { - m_storage[row*m_cols+col] += val; + m_storage[row * m_cols + col] += val; } } } - - - void setElem(int row,int col, T val) + + void setElem(int row, int col, T val) { m_setElemOperations++; - m_storage[row*m_cols+col] = val; + m_storage[row * m_cols + col] = val; } - - void mulElem(int row,int col, T val) + + void mulElem(int row, int col, T val) { m_setElemOperations++; //mul doesn't change sparsity info - m_storage[row*m_cols+col] *= val; + m_storage[row * m_cols + col] *= val; } - - - - + void copyLowerToUpperTriangle() { - int count=0; - for (int row=0;row0 && numRowsOther>0 && B && C); - const btScalar *bb = B; - for ( int i = 0;i 0 && numRowsOther > 0 && B && C); + const btScalar* bb = B; + for (int i = 0; i < numRows; i++) { - const btScalar *cc = C; - for ( int j = 0;j& block) + void setSubMatrix(int rowstart, int colstart, int rowend, int colend, const btVectorX<T>& block) { - btAssert(rowend+1-rowstart == block.rows()); - btAssert(colend+1-colstart == block.cols()); - for (int row=0;row btMatrixXf; typedef btVectorX<float> btVectorXf; typedef btMatrixX<double> btMatrixXd; typedef btVectorX<double> btVectorXd; - #ifdef BT_DEBUG_OSTREAM -template <typename T> -std::ostream& operator<< (std::ostream& os, const btMatrixX<T>& mat) +template <typename T> +std::ostream& operator<<(std::ostream& os, const btMatrixX<T>& mat) +{ + os << " ["; + //printf("%s ---------------------\n",msg); + for (int i = 0; i < mat.rows(); i++) { - - os << " ["; - //printf("%s ---------------------\n",msg); - for (int i=0;i -std::ostream& operator<< (std::ostream& os, const btVectorX<T>& mat) - { - - os << " ["; - //printf("%s ---------------------\n",msg); - for (int i=0;i +std::ostream& operator<<(std::ostream& os, const btVectorX<T>& mat) +{ + os << " ["; + //printf("%s ---------------------\n",msg); + for (int i = 0; i < mat.rows(); i++) + { + os << std::setw(12) << mat[i]; + if (i != mat.rows() - 1) + os << std::endl + << " "; } + os << " ]"; + //printf("\n---------------------\n"); -#endif //BT_DEBUG_OSTREAM + return os; +} +#endif //BT_DEBUG_OSTREAM inline void setElem(btMatrixXd& mat, int row, int col, double val) { - mat.setElem(row,col,val); + mat.setElem(row, col, val); } inline void setElem(btMatrixXf& mat, int row, int col, float val) { - mat.setElem(row,col,val); + mat.setElem(row, col, val); } #ifdef BT_USE_DOUBLE_PRECISION - #define btVectorXu btVectorXd - #define btMatrixXu btMatrixXd +#define btVectorXu btVectorXd +#define btMatrixXu btMatrixXd #else - #define btVectorXu btVectorXf - #define btMatrixXu btMatrixXf -#endif 
//BT_USE_DOUBLE_PRECISION - - +#define btVectorXu btVectorXf +#define btMatrixXu btMatrixXf +#endif //BT_USE_DOUBLE_PRECISION -#endif//BT_MATRIX_H_H +#endif //BT_MATRIX_H_H diff --git a/thirdparty/bullet/LinearMath/btMinMax.h b/thirdparty/bullet/LinearMath/btMinMax.h index 5b436e9ba4..92fea0275a 100644 --- a/thirdparty/bullet/LinearMath/btMinMax.h +++ b/thirdparty/bullet/LinearMath/btMinMax.h @@ -12,60 +12,58 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_GEN_MINMAX_H #define BT_GEN_MINMAX_H #include "btScalar.h" template <class T> -SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) +SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) { - return a < b ? a : b ; + return a < b ? a : b; } template <class T> -SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b) +SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b) { - return a > b ? a : b; + return a > b ? a : b; } template <class T> -SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) +SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) { - return a < lb ? lb : (ub < a ? ub : a); + return a < lb ? lb : (ub < a ? ub : a); } template <class T> -SIMD_FORCE_INLINE void btSetMin(T& a, const T& b) +SIMD_FORCE_INLINE void btSetMin(T& a, const T& b) { - if (b < a) + if (b < a) { a = b; } } template <class T> -SIMD_FORCE_INLINE void btSetMax(T& a, const T& b) +SIMD_FORCE_INLINE void btSetMax(T& a, const T& b) { - if (a < b) + if (a < b) { a = b; } } template <class T> -SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) +SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) { - if (a < lb) + if (a < lb) { - a = lb; + a = lb; } - else if (ub < a) + else if (ub < a) { a = ub; } } -#endif //BT_GEN_MINMAX_H +#endif //BT_GEN_MINMAX_H diff --git a/thirdparty/bullet/LinearMath/btMotionState.h b/thirdparty/bullet/LinearMath/btMotionState.h index 9431814090..ae6a51611d 100644 --- a/thirdparty/bullet/LinearMath/btMotionState.h +++ b/thirdparty/bullet/LinearMath/btMotionState.h @@ -20,21 +20,17 @@ subject to the following restrictions: ///The btMotionState interface class allows the dynamics world to synchronize and interpolate the updated world transforms with graphics ///For optimizations, potentially only moving objects get synchronized (using setWorldPosition/setWorldOrientation) -class btMotionState +class btMotionState { - public: - - virtual ~btMotionState() - { - - } - - virtual void getWorldTransform(btTransform& worldTrans ) const =0; - - //Bullet only calls the update of worldtransform for active objects - virtual void setWorldTransform(const btTransform& worldTrans)=0; - - +public: + virtual ~btMotionState() + { + } + + virtual void getWorldTransform(btTransform& worldTrans) const = 0; + + //Bullet only calls the update of worldtransform for active objects + virtual void setWorldTransform(const btTransform& worldTrans) = 0; }; -#endif //BT_MOTIONSTATE_H +#endif //BT_MOTIONSTATE_H diff --git a/thirdparty/bullet/LinearMath/btPolarDecomposition.cpp b/thirdparty/bullet/LinearMath/btPolarDecomposition.cpp index b3664faa4e..d9c72a8014 100644 --- a/thirdparty/bullet/LinearMath/btPolarDecomposition.cpp +++ b/thirdparty/bullet/LinearMath/btPolarDecomposition.cpp @@ -3,96 +3,92 @@ namespace { - btScalar abs_column_sum(const btMatrix3x3& a, int i) - { - return btFabs(a[0][i]) + btFabs(a[1][i]) + btFabs(a[2][i]); - } - - btScalar abs_row_sum(const btMatrix3x3& a, int i) - { - return btFabs(a[i][0]) + btFabs(a[i][1]) + btFabs(a[i][2]); - } - - 
btScalar p1_norm(const btMatrix3x3& a) - { - const btScalar sum0 = abs_column_sum(a,0); - const btScalar sum1 = abs_column_sum(a,1); - const btScalar sum2 = abs_column_sum(a,2); - return btMax(btMax(sum0, sum1), sum2); - } - - btScalar pinf_norm(const btMatrix3x3& a) - { - const btScalar sum0 = abs_row_sum(a,0); - const btScalar sum1 = abs_row_sum(a,1); - const btScalar sum2 = abs_row_sum(a,2); - return btMax(btMax(sum0, sum1), sum2); - } +btScalar abs_column_sum(const btMatrix3x3& a, int i) +{ + return btFabs(a[0][i]) + btFabs(a[1][i]) + btFabs(a[2][i]); } +btScalar abs_row_sum(const btMatrix3x3& a, int i) +{ + return btFabs(a[i][0]) + btFabs(a[i][1]) + btFabs(a[i][2]); +} +btScalar p1_norm(const btMatrix3x3& a) +{ + const btScalar sum0 = abs_column_sum(a, 0); + const btScalar sum1 = abs_column_sum(a, 1); + const btScalar sum2 = abs_column_sum(a, 2); + return btMax(btMax(sum0, sum1), sum2); +} + +btScalar pinf_norm(const btMatrix3x3& a) +{ + const btScalar sum0 = abs_row_sum(a, 0); + const btScalar sum1 = abs_row_sum(a, 1); + const btScalar sum2 = abs_row_sum(a, 2); + return btMax(btMax(sum0, sum1), sum2); +} +} // namespace btPolarDecomposition::btPolarDecomposition(btScalar tolerance, unsigned int maxIterations) -: m_tolerance(tolerance) -, m_maxIterations(maxIterations) + : m_tolerance(tolerance), m_maxIterations(maxIterations) { } unsigned int btPolarDecomposition::decompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) const { - // Use the 'u' and 'h' matrices for intermediate calculations - u = a; - h = a.inverse(); - - for (unsigned int i = 0; i < m_maxIterations; ++i) - { - const btScalar h_1 = p1_norm(h); - const btScalar h_inf = pinf_norm(h); - const btScalar u_1 = p1_norm(u); - const btScalar u_inf = pinf_norm(u); - - const btScalar h_norm = h_1 * h_inf; - const btScalar u_norm = u_1 * u_inf; - - // The matrix is effectively singular so we cannot invert it - if (btFuzzyZero(h_norm) || btFuzzyZero(u_norm)) - break; - - const btScalar gamma = btPow(h_norm / u_norm, 0.25f); - const btScalar inv_gamma = btScalar(1.0) / gamma; - - // Determine the delta to 'u' - const btMatrix3x3 delta = (u * (gamma - btScalar(2.0)) + h.transpose() * inv_gamma) * btScalar(0.5); - - // Update the matrices - u += delta; - h = u.inverse(); - - // Check for convergence - if (p1_norm(delta) <= m_tolerance * u_1) - { - h = u.transpose() * a; - h = (h + h.transpose()) * 0.5; - return i; - } - } - - // The algorithm has failed to converge to the specified tolerance, but we - // want to make sure that the matrices returned are in the right form. 
- h = u.transpose() * a; - h = (h + h.transpose()) * 0.5; - - return m_maxIterations; + // Use the 'u' and 'h' matrices for intermediate calculations + u = a; + h = a.inverse(); + + for (unsigned int i = 0; i < m_maxIterations; ++i) + { + const btScalar h_1 = p1_norm(h); + const btScalar h_inf = pinf_norm(h); + const btScalar u_1 = p1_norm(u); + const btScalar u_inf = pinf_norm(u); + + const btScalar h_norm = h_1 * h_inf; + const btScalar u_norm = u_1 * u_inf; + + // The matrix is effectively singular so we cannot invert it + if (btFuzzyZero(h_norm) || btFuzzyZero(u_norm)) + break; + + const btScalar gamma = btPow(h_norm / u_norm, 0.25f); + const btScalar inv_gamma = btScalar(1.0) / gamma; + + // Determine the delta to 'u' + const btMatrix3x3 delta = (u * (gamma - btScalar(2.0)) + h.transpose() * inv_gamma) * btScalar(0.5); + + // Update the matrices + u += delta; + h = u.inverse(); + + // Check for convergence + if (p1_norm(delta) <= m_tolerance * u_1) + { + h = u.transpose() * a; + h = (h + h.transpose()) * 0.5; + return i; + } + } + + // The algorithm has failed to converge to the specified tolerance, but we + // want to make sure that the matrices returned are in the right form. + h = u.transpose() * a; + h = (h + h.transpose()) * 0.5; + + return m_maxIterations; } unsigned int btPolarDecomposition::maxIterations() const { - return m_maxIterations; + return m_maxIterations; } unsigned int polarDecompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) { - static btPolarDecomposition polar; - return polar.decompose(a, u, h); + static btPolarDecomposition polar; + return polar.decompose(a, u, h); } - diff --git a/thirdparty/bullet/LinearMath/btPolarDecomposition.h b/thirdparty/bullet/LinearMath/btPolarDecomposition.h index 1feea0f78e..bf29140a14 100644 --- a/thirdparty/bullet/LinearMath/btPolarDecomposition.h +++ b/thirdparty/bullet/LinearMath/btPolarDecomposition.h @@ -13,10 +13,8 @@ */ class btPolarDecomposition { - public: - - - /** +public: + /** * Creates an instance with optional parameters. * * @param tolerance - the tolerance used to determine convergence of the @@ -24,10 +22,10 @@ class btPolarDecomposition * @param maxIterations - the maximum number of iterations used to achieve * convergence */ - btPolarDecomposition(btScalar tolerance = btScalar(0.0001), - unsigned int maxIterations = 16); + btPolarDecomposition(btScalar tolerance = btScalar(0.0001), + unsigned int maxIterations = 16); - /** + /** * Decomposes a matrix into orthogonal and symmetric, positive-definite * parts. If the number of iterations returned by this function is equal to * the maximum number of iterations, the algorithm has failed to converge. @@ -38,19 +36,19 @@ class btPolarDecomposition * * @return the number of iterations performed by the algorithm. */ - unsigned int decompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) const; + unsigned int decompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h) const; - /** + /** * Returns the maximum number of iterations that this algorithm will perform * to achieve convergence. * * @return maximum number of iterations */ - unsigned int maxIterations() const; + unsigned int maxIterations() const; - private: - btScalar m_tolerance; - unsigned int m_maxIterations; +private: + btScalar m_tolerance; + unsigned int m_maxIterations; }; /** @@ -66,7 +64,6 @@ class btPolarDecomposition * * @return the number of iterations performed by the algorithm. 
*/ -unsigned int polarDecompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h); - -#endif // POLARDECOMPOSITION_H +unsigned int polarDecompose(const btMatrix3x3& a, btMatrix3x3& u, btMatrix3x3& h); +#endif // POLARDECOMPOSITION_H diff --git a/thirdparty/bullet/LinearMath/btPoolAllocator.h b/thirdparty/bullet/LinearMath/btPoolAllocator.h index efdeda8ffc..4e7b49660a 100644 --- a/thirdparty/bullet/LinearMath/btPoolAllocator.h +++ b/thirdparty/bullet/LinearMath/btPoolAllocator.h @@ -12,7 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef _BT_POOL_ALLOCATOR_H #define _BT_POOL_ALLOCATOR_H @@ -23,38 +22,38 @@ subject to the following restrictions: ///The btPoolAllocator class allows to efficiently allocate a large pool of objects, instead of dynamically allocating them separately. class btPoolAllocator { - int m_elemSize; - int m_maxElements; - int m_freeCount; - void* m_firstFree; - unsigned char* m_pool; - btSpinMutex m_mutex; // only used if BT_THREADSAFE + int m_elemSize; + int m_maxElements; + int m_freeCount; + void* m_firstFree; + unsigned char* m_pool; + btSpinMutex m_mutex; // only used if BT_THREADSAFE public: - btPoolAllocator(int elemSize, int maxElements) - :m_elemSize(elemSize), - m_maxElements(maxElements) + : m_elemSize(elemSize), + m_maxElements(maxElements) { - m_pool = (unsigned char*) btAlignedAlloc( static_cast<unsigned int>(m_elemSize*m_maxElements),16); + m_pool = (unsigned char*)btAlignedAlloc(static_cast<unsigned int>(m_elemSize * m_maxElements), 16); unsigned char* p = m_pool; - m_firstFree = p; - m_freeCount = m_maxElements; - int count = m_maxElements; - while (--count) { - *(void**)p = (p + m_elemSize); - p += m_elemSize; - } - *(void**)p = 0; - } + m_firstFree = p; + m_freeCount = m_maxElements; + int count = m_maxElements; + while (--count) + { + *(void**)p = (p + m_elemSize); + p += m_elemSize; + } + *(void**)p = 0; + } ~btPoolAllocator() { - btAlignedFree( m_pool); + btAlignedFree(m_pool); } - int getFreeCount() const + int getFreeCount() const { return m_freeCount; } @@ -69,26 +68,27 @@ public: return m_maxElements; } - void* allocate(int size) + void* allocate(int size) { // release mode fix (void)size; - btMutexLock(&m_mutex); - btAssert(!size || size<=m_elemSize); + btMutexLock(&m_mutex); + btAssert(!size || size <= m_elemSize); //btAssert(m_freeCount>0); // should return null if all full - void* result = m_firstFree; - if (NULL != m_firstFree) - { - m_firstFree = *(void**)m_firstFree; - --m_freeCount; - } - btMutexUnlock(&m_mutex); - return result; + void* result = m_firstFree; + if (NULL != m_firstFree) + { + m_firstFree = *(void**)m_firstFree; + --m_freeCount; + } + btMutexUnlock(&m_mutex); + return result; } bool validPtr(void* ptr) { - if (ptr) { + if (ptr) + { if (((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize)) { return true; @@ -97,34 +97,34 @@ public: return false; } - void freeMemory(void* ptr) + void freeMemory(void* ptr) { - if (ptr) { - btAssert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize); - - btMutexLock(&m_mutex); - *(void**)ptr = m_firstFree; - m_firstFree = ptr; - ++m_freeCount; - btMutexUnlock(&m_mutex); - } + if (ptr) + { + btAssert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize); + + btMutexLock(&m_mutex); + *(void**)ptr = m_firstFree; + m_firstFree = ptr; + ++m_freeCount; + btMutexUnlock(&m_mutex); + } } - int 
getElementSize() const { return m_elemSize; } - unsigned char* getPoolAddress() + unsigned char* getPoolAddress() { return m_pool; } - const unsigned char* getPoolAddress() const + const unsigned char* getPoolAddress() const { return m_pool; } - }; -#endif //_BT_POOL_ALLOCATOR_H +#endif //_BT_POOL_ALLOCATOR_H diff --git a/thirdparty/bullet/LinearMath/btQuadWord.h b/thirdparty/bullet/LinearMath/btQuadWord.h index fcfb3be444..ab2d3175ad 100644 --- a/thirdparty/bullet/LinearMath/btQuadWord.h +++ b/thirdparty/bullet/LinearMath/btQuadWord.h @@ -12,18 +12,13 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef BT_SIMD_QUADWORD_H #define BT_SIMD_QUADWORD_H #include "btScalar.h" #include "btMinMax.h" - - - - -#if defined (__CELLOS_LV2) && defined (__SPU__) +#if defined(__CELLOS_LV2) && defined(__SPU__) #include <spu_printf.h> #endif @@ -31,51 +26,53 @@ subject to the following restrictions: * Some issues under PS3 Linux with IBM 2.1 SDK, gcc compiler prevent from using aligned quadword. */ #ifndef USE_LIBSPE2 -ATTRIBUTE_ALIGNED16(class) btQuadWord +ATTRIBUTE_ALIGNED16(class) +btQuadWord #else class btQuadWord #endif { protected: - -#if defined (__SPU__) && defined (__CELLOS_LV2__) +#if defined(__SPU__) && defined(__CELLOS_LV2__) union { vec_float4 mVec128; - btScalar m_floats[4]; + btScalar m_floats[4]; }; + public: - vec_float4 get128() const + vec_float4 get128() const { return mVec128; } + protected: -#else //__CELLOS_LV2__ __SPU__ +#else //__CELLOS_LV2__ __SPU__ -#if defined(BT_USE_SSE) || defined(BT_USE_NEON) +#if defined(BT_USE_SSE) || defined(BT_USE_NEON) union { btSimdFloat4 mVec128; - btScalar m_floats[4]; + btScalar m_floats[4]; }; + public: - SIMD_FORCE_INLINE btSimdFloat4 get128() const + SIMD_FORCE_INLINE btSimdFloat4 get128() const { return mVec128; } - SIMD_FORCE_INLINE void set128(btSimdFloat4 v128) + SIMD_FORCE_INLINE void set128(btSimdFloat4 v128) { mVec128 = v128; } #else - btScalar m_floats[4]; -#endif // BT_USE_SSE + btScalar m_floats[4]; +#endif // BT_USE_SSE -#endif //__CELLOS_LV2__ __SPU__ +#endif //__CELLOS_LV2__ __SPU__ - public: - +public: #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) - // Set Vector + // Set Vector SIMD_FORCE_INLINE btQuadWord(const btSimdFloat4 vec) { mVec128 = vec; @@ -88,157 +85,154 @@ public: } // Assignment Operator - SIMD_FORCE_INLINE btQuadWord& - operator=(const btQuadWord& v) + SIMD_FORCE_INLINE btQuadWord& + operator=(const btQuadWord& v) { mVec128 = v.mVec128; - + return *this; } - + #endif - /**@brief Return the x value */ - SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; } - /**@brief Return the y value */ - SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; } - /**@brief Return the z value */ - SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; } - /**@brief Set the x value */ - SIMD_FORCE_INLINE void setX(btScalar _x) { m_floats[0] = _x;}; - /**@brief Set the y value */ - SIMD_FORCE_INLINE void setY(btScalar _y) { m_floats[1] = _y;}; - /**@brief Set the z value */ - SIMD_FORCE_INLINE void setZ(btScalar _z) { m_floats[2] = _z;}; - /**@brief Set the w value */ - SIMD_FORCE_INLINE void setW(btScalar _w) { m_floats[3] = _w;}; - /**@brief Return the x value */ - SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; } - /**@brief Return the y value */ - SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; } - /**@brief Return the z value */ - SIMD_FORCE_INLINE const 
btScalar& z() const { return m_floats[2]; } - /**@brief Return the w value */ - SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; } - - //SIMD_FORCE_INLINE btScalar& operator[](int i) { return (&m_floats[0])[i]; } + /**@brief Return the x value */ + SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; } + /**@brief Return the y value */ + SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; } + /**@brief Return the z value */ + SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; } + /**@brief Set the x value */ + SIMD_FORCE_INLINE void setX(btScalar _x) { m_floats[0] = _x; }; + /**@brief Set the y value */ + SIMD_FORCE_INLINE void setY(btScalar _y) { m_floats[1] = _y; }; + /**@brief Set the z value */ + SIMD_FORCE_INLINE void setZ(btScalar _z) { m_floats[2] = _z; }; + /**@brief Set the w value */ + SIMD_FORCE_INLINE void setW(btScalar _w) { m_floats[3] = _w; }; + /**@brief Return the x value */ + SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; } + /**@brief Return the y value */ + SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; } + /**@brief Return the z value */ + SIMD_FORCE_INLINE const btScalar& z() const { return m_floats[2]; } + /**@brief Return the w value */ + SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; } + + //SIMD_FORCE_INLINE btScalar& operator[](int i) { return (&m_floats[0])[i]; } //SIMD_FORCE_INLINE const btScalar& operator[](int i) const { return (&m_floats[0])[i]; } ///operator btScalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons. - SIMD_FORCE_INLINE operator btScalar *() { return &m_floats[0]; } - SIMD_FORCE_INLINE operator const btScalar *() const { return &m_floats[0]; } + SIMD_FORCE_INLINE operator btScalar*() { return &m_floats[0]; } + SIMD_FORCE_INLINE operator const btScalar*() const { return &m_floats[0]; } - SIMD_FORCE_INLINE bool operator==(const btQuadWord& other) const + SIMD_FORCE_INLINE bool operator==(const btQuadWord& other) const { #ifdef BT_USE_SSE - return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); -#else - return ((m_floats[3]==other.m_floats[3]) && - (m_floats[2]==other.m_floats[2]) && - (m_floats[1]==other.m_floats[1]) && - (m_floats[0]==other.m_floats[0])); + return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); +#else + return ((m_floats[3] == other.m_floats[3]) && + (m_floats[2] == other.m_floats[2]) && + (m_floats[1] == other.m_floats[1]) && + (m_floats[0] == other.m_floats[0])); #endif } - SIMD_FORCE_INLINE bool operator!=(const btQuadWord& other) const + SIMD_FORCE_INLINE bool operator!=(const btQuadWord& other) const { return !(*this == other); } - /**@brief Set x,y,z and zero w + /**@brief Set x,y,z and zero w * @param x Value of x * @param y Value of y * @param z Value of z */ - SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3] = 0.f; - } + SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = 0.f; + } -/* void getValue(btScalar *m) const + /* void getValue(btScalar *m) const { m[0] = m_floats[0]; m[1] = m_floats[1]; m[2] = m_floats[2]; } */ -/**@brief Set the values + /**@brief Set the values * @param x Value of x * @param y Value of y * @param z Value of z * @param w 
Value of w */ - SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3]=_w; - } - /**@brief No initialization constructor */ - SIMD_FORCE_INLINE btQuadWord() - // :m_floats[0](btScalar(0.)),m_floats[1](btScalar(0.)),m_floats[2](btScalar(0.)),m_floats[3](btScalar(0.)) - { - } - - /**@brief Three argument constructor (zeros w) + SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = _w; + } + /**@brief No initialization constructor */ + SIMD_FORCE_INLINE btQuadWord() + // :m_floats[0](btScalar(0.)),m_floats[1](btScalar(0.)),m_floats[2](btScalar(0.)),m_floats[3](btScalar(0.)) + { + } + + /**@brief Three argument constructor (zeros w) * @param x Value of x * @param y Value of y * @param z Value of z */ - SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z) - { - m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f; - } + SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z) + { + m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f; + } -/**@brief Initializing constructor + /**@brief Initializing constructor * @param x Value of x * @param y Value of y * @param z Value of z * @param w Value of w */ - SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) - { - m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w; - } + SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) + { + m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w; + } - /**@brief Set each element to the max of the current values and the values of another btQuadWord + /**@brief Set each element to the max of the current values and the values of another btQuadWord * @param other The other btQuadWord to compare with */ - SIMD_FORCE_INLINE void setMax(const btQuadWord& other) - { - #ifdef BT_USE_SSE - mVec128 = _mm_max_ps(mVec128, other.mVec128); - #elif defined(BT_USE_NEON) - mVec128 = vmaxq_f32(mVec128, other.mVec128); - #else - btSetMax(m_floats[0], other.m_floats[0]); - btSetMax(m_floats[1], other.m_floats[1]); - btSetMax(m_floats[2], other.m_floats[2]); - btSetMax(m_floats[3], other.m_floats[3]); - #endif - } - /**@brief Set each element to the min of the current values and the values of another btQuadWord + SIMD_FORCE_INLINE void setMax(const btQuadWord& other) + { +#ifdef BT_USE_SSE + mVec128 = _mm_max_ps(mVec128, other.mVec128); +#elif defined(BT_USE_NEON) + mVec128 = vmaxq_f32(mVec128, other.mVec128); +#else + btSetMax(m_floats[0], other.m_floats[0]); + btSetMax(m_floats[1], other.m_floats[1]); + btSetMax(m_floats[2], other.m_floats[2]); + btSetMax(m_floats[3], other.m_floats[3]); +#endif + } + /**@brief Set each element to the min of the current values and the values of another btQuadWord * @param other The other btQuadWord to compare with */ - SIMD_FORCE_INLINE void setMin(const btQuadWord& other) - { - #ifdef BT_USE_SSE - mVec128 = _mm_min_ps(mVec128, other.mVec128); - #elif defined(BT_USE_NEON) - mVec128 = vminq_f32(mVec128, other.mVec128); - #else - btSetMin(m_floats[0], other.m_floats[0]); - btSetMin(m_floats[1], other.m_floats[1]); - btSetMin(m_floats[2], other.m_floats[2]); - btSetMin(m_floats[3], 
other.m_floats[3]); - #endif - } - - - + SIMD_FORCE_INLINE void setMin(const btQuadWord& other) + { +#ifdef BT_USE_SSE + mVec128 = _mm_min_ps(mVec128, other.mVec128); +#elif defined(BT_USE_NEON) + mVec128 = vminq_f32(mVec128, other.mVec128); +#else + btSetMin(m_floats[0], other.m_floats[0]); + btSetMin(m_floats[1], other.m_floats[1]); + btSetMin(m_floats[2], other.m_floats[2]); + btSetMin(m_floats[3], other.m_floats[3]); +#endif + } }; -#endif //BT_SIMD_QUADWORD_H +#endif //BT_SIMD_QUADWORD_H diff --git a/thirdparty/bullet/LinearMath/btQuaternion.h b/thirdparty/bullet/LinearMath/btQuaternion.h index a98fec7bc4..53e8169b80 100644 --- a/thirdparty/bullet/LinearMath/btQuaternion.h +++ b/thirdparty/bullet/LinearMath/btQuaternion.h @@ -12,25 +12,19 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_SIMD__QUATERNION_H_ #define BT_SIMD__QUATERNION_H_ - #include "btVector3.h" #include "btQuadWord.h" - #ifdef BT_USE_DOUBLE_PRECISION #define btQuaternionData btQuaternionDoubleData #define btQuaternionDataName "btQuaternionDoubleData" #else #define btQuaternionData btQuaternionFloatData #define btQuaternionDataName "btQuaternionFloatData" -#endif //BT_USE_DOUBLE_PRECISION - - +#endif //BT_USE_DOUBLE_PRECISION #ifdef BT_USE_SSE @@ -39,7 +33,7 @@ subject to the following restrictions: #endif -#if defined(BT_USE_SSE) +#if defined(BT_USE_SSE) #define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f)) #define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f)) @@ -52,13 +46,14 @@ const btSimdFloat4 ATTRIBUTE_ALIGNED16(vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f}; #endif /**@brief The btQuaternion implements quaternion to perform linear algebra rotations in combination with btMatrix3x3, btVector3 and btTransform. 
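 * A minimal usage sketch (illustrative only, not part of the upstream header;
 * every identifier used is the public Bullet API):
 *   btQuaternion q(btVector3(0, 1, 0), SIMD_HALF_PI); // 90 degrees about +Y
 *   btVector3 v = quatRotate(q, btVector3(1, 0, 0));  // maps +X to -Z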
*/ -class btQuaternion : public btQuadWord { +class btQuaternion : public btQuadWord +{ public: - /**@brief No initialization constructor */ + /**@brief No initialization constructor */ btQuaternion() {} -#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))|| defined(BT_USE_NEON) - // Set Vector +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + // Set Vector SIMD_FORCE_INLINE btQuaternion(const btSimdFloat4 vec) { mVec128 = vec; @@ -71,42 +66,43 @@ public: } // Assignment Operator - SIMD_FORCE_INLINE btQuaternion& - operator=(const btQuaternion& v) + SIMD_FORCE_INLINE btQuaternion& + operator=(const btQuaternion& v) { mVec128 = v.mVec128; - + return *this; } - + #endif // template // explicit Quaternion(const btScalar *v) : Tuple4(v) {} - /**@brief Constructor from scalars */ - btQuaternion(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) - : btQuadWord(_x, _y, _z, _w) - {} - /**@brief Axis angle Constructor + /**@brief Constructor from scalars */ + btQuaternion(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) + : btQuadWord(_x, _y, _z, _w) + { + } + /**@brief Axis angle Constructor * @param axis The axis which the rotation is around * @param angle The magnitude of the rotation around the angle (Radians) */ - btQuaternion(const btVector3& _axis, const btScalar& _angle) - { - setRotation(_axis, _angle); + btQuaternion(const btVector3& _axis, const btScalar& _angle) + { + setRotation(_axis, _angle); } - /**@brief Constructor from Euler angles + /**@brief Constructor from Euler angles * @param yaw Angle around Y unless BT_EULER_DEFAULT_ZYX defined then Z * @param pitch Angle around X unless BT_EULER_DEFAULT_ZYX defined then Y * @param roll Angle around Z unless BT_EULER_DEFAULT_ZYX defined then X */ btQuaternion(const btScalar& yaw, const btScalar& pitch, const btScalar& roll) - { + { #ifndef BT_EULER_DEFAULT_ZYX - setEuler(yaw, pitch, roll); + setEuler(yaw, pitch, roll); #else - setEulerZYX(yaw, pitch, roll); -#endif + setEulerZYX(yaw, pitch, roll); +#endif } - /**@brief Set the rotation using axis angle notation + /**@brief Set the rotation using axis angle notation * @param axis The axis around which to rotate * @param angle The magnitude of the rotation in Radians */ void setRotation(const btVector3& axis, const btScalar& _angle) @@ -114,18 +110,18 @@ public: btScalar d = axis.length(); btAssert(d != btScalar(0.0)); btScalar s = btSin(_angle * btScalar(0.5)) / d; - setValue(axis.x() * s, axis.y() * s, axis.z() * s, - btCos(_angle * btScalar(0.5))); + setValue(axis.x() * s, axis.y() * s, axis.z() * s, + btCos(_angle * btScalar(0.5))); } - /**@brief Set the quaternion using Euler angles + /**@brief Set the quaternion using Euler angles * @param yaw Angle around Y * @param pitch Angle around X * @param roll Angle around Z */ void setEuler(const btScalar& yaw, const btScalar& pitch, const btScalar& roll) { - btScalar halfYaw = btScalar(yaw) * btScalar(0.5); - btScalar halfPitch = btScalar(pitch) * btScalar(0.5); - btScalar halfRoll = btScalar(roll) * btScalar(0.5); + btScalar halfYaw = btScalar(yaw) * btScalar(0.5); + btScalar halfPitch = btScalar(pitch) * btScalar(0.5); + btScalar halfRoll = btScalar(roll) * btScalar(0.5); btScalar cosYaw = btCos(halfYaw); btScalar sinYaw = btSin(halfYaw); btScalar cosPitch = btCos(halfPitch); @@ -133,32 +129,32 @@ public: btScalar cosRoll = btCos(halfRoll); btScalar sinRoll = btSin(halfRoll); setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, - 
cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, - sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, - cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); + cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, + sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, + cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); } - /**@brief Set the quaternion using euler angles + /**@brief Set the quaternion using euler angles * @param yaw Angle around Z * @param pitch Angle around Y * @param roll Angle around X */ void setEulerZYX(const btScalar& yawZ, const btScalar& pitchY, const btScalar& rollX) { - btScalar halfYaw = btScalar(yawZ) * btScalar(0.5); - btScalar halfPitch = btScalar(pitchY) * btScalar(0.5); - btScalar halfRoll = btScalar(rollX) * btScalar(0.5); + btScalar halfYaw = btScalar(yawZ) * btScalar(0.5); + btScalar halfPitch = btScalar(pitchY) * btScalar(0.5); + btScalar halfRoll = btScalar(rollX) * btScalar(0.5); btScalar cosYaw = btCos(halfYaw); btScalar sinYaw = btSin(halfYaw); btScalar cosPitch = btCos(halfPitch); btScalar sinPitch = btSin(halfPitch); btScalar cosRoll = btCos(halfRoll); btScalar sinRoll = btSin(halfRoll); - setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x - cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y - cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z - cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx + setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x + cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y + cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z + cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx } - /**@brief Get the euler angles from this quaternion + /**@brief Get the euler angles from this quaternion * @param yaw Angle around Z * @param pitch Angle around Y * @param roll Angle around X */ @@ -173,23 +169,25 @@ public: sqy = m_floats[1] * m_floats[1]; sqz = m_floats[2] * m_floats[2]; squ = m_floats[3] * m_floats[3]; - sarg = btScalar(-2.) * (m_floats[0] * m_floats[2] - m_floats[3] * m_floats[1]); - + sarg = btScalar(-2.) * (m_floats[0] * m_floats[2] - m_floats[3] * m_floats[1]); + // If the pitch angle is PI/2 or -PI/2, we can only compute // the sum roll + yaw. However, any combination that gives // the right sum will produce the correct orientation, so we // set rollX = 0 and compute yawZ. 
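 	// (Illustrative note, not in the upstream source: sarg is the sine of
 	// pitchY, so sarg >= 0.99999 means the pitch is essentially +PI/2; the
 	// branches below then report the whole roll + yaw sum as yawZ with rollX = 0.)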
if (sarg <= -btScalar(0.99999))
 	{
-		pitchY = btScalar(-0.5)*SIMD_PI;
-		rollX = 0;
-		yawZ = btScalar(2) * btAtan2(m_floats[0],-m_floats[1]);
-	} else if (sarg >= btScalar(0.99999))
+		pitchY = btScalar(-0.5) * SIMD_PI;
+		rollX = 0;
+		yawZ = btScalar(2) * btAtan2(m_floats[0], -m_floats[1]);
+	}
+	else if (sarg >= btScalar(0.99999))
 	{
-		pitchY = btScalar(0.5)*SIMD_PI;
-		rollX = 0;
-		yawZ = btScalar(2) * btAtan2(-m_floats[0], m_floats[1]);
-	} else
+		pitchY = btScalar(0.5) * SIMD_PI;
+		rollX = 0;
+		yawZ = btScalar(2) * btAtan2(-m_floats[0], m_floats[1]);
+	}
+	else
 	{
 		pitchY = btAsin(sarg);
 		rollX = btAtan2(2 * (m_floats[1] * m_floats[2] + m_floats[3] * m_floats[0]), squ - sqx - sqy + sqz);
@@ -197,178 +195,178 @@ public:
 		}
 	}

-	/**@brief Add two quaternions
+	/**@brief Add two quaternions
   * @param q The quaternion to add to this one */
-	SIMD_FORCE_INLINE	btQuaternion& operator+=(const btQuaternion& q)
+	SIMD_FORCE_INLINE btQuaternion& operator+=(const btQuaternion& q)
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		mVec128 = _mm_add_ps(mVec128, q.mVec128);
 #elif defined(BT_USE_NEON)
 		mVec128 = vaddq_f32(mVec128, q.mVec128);
-#else
-	m_floats[0] += q.x();
-	m_floats[1] += q.y();
-	m_floats[2] += q.z();
-	m_floats[3] += q.m_floats[3];
+#else
+	m_floats[0] += q.x();
+	m_floats[1] += q.y();
+	m_floats[2] += q.z();
+	m_floats[3] += q.m_floats[3];
 #endif
 		return *this;
 	}

-	/**@brief Subtract out a quaternion
+	/**@brief Subtract out a quaternion
   * @param q The quaternion to subtract from this one */
-	btQuaternion& operator-=(const btQuaternion& q)
+	btQuaternion& operator-=(const btQuaternion& q)
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
 #elif defined(BT_USE_NEON)
 		mVec128 = vsubq_f32(mVec128, q.mVec128);
-#else
-	m_floats[0] -= q.x();
-	m_floats[1] -= q.y();
-	m_floats[2] -= q.z();
-	m_floats[3] -= q.m_floats[3];
+#else
+	m_floats[0] -= q.x();
+	m_floats[1] -= q.y();
+	m_floats[2] -= q.z();
+	m_floats[3] -= q.m_floats[3];
 #endif
-	return *this;
+	return *this;
 	}

-	/**@brief Scale this quaternion
+	/**@brief Scale this quaternion
   * @param s The scalar to scale by */
 	btQuaternion& operator*=(const btScalar& s)
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
-		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
-		vs = bt_pshufd_ps(vs, 0);	//	(S S S S)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
+	__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+	vs = bt_pshufd_ps(vs, 0);     //	(S S S S)
 		mVec128 = _mm_mul_ps(mVec128, vs);
 #elif defined(BT_USE_NEON)
 		mVec128 = vmulq_n_f32(mVec128, s);
 #else
-		m_floats[0] *= s;
-		m_floats[1] *= s;
-		m_floats[2] *= s;
-		m_floats[3] *= s;
+	m_floats[0] *= s;
+	m_floats[1] *= s;
+	m_floats[2] *= s;
+	m_floats[3] *= s;
#endif
 		return *this;
 	}

-	/**@brief Multiply this quaternion by q on the right
+	/**@brief Multiply this quaternion by q on the right
   * @param q The other quaternion
   * Equivalent to this = this * q */
 	btQuaternion& operator*=(const btQuaternion& q)
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		__m128 vQ2 = q.get128();
-
-		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
-		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
-
+
+		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0, 1, 2, 0));
+		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));
+
 		A1 = A1 * B1;
-
-		__m128 A2 = bt_pshufd_ps(mVec128,
BT_SHUFFLE(1,2,0,1)); - __m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); - + + __m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 1)); + __m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1)); + A2 = A2 * B2; - - B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2)); - B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); - - B1 = B1 * B2; // A3 *= B3 - - mVec128 = bt_splat_ps(mVec128, 3); // A0 - mVec128 = mVec128 * vQ2; // A0 * B0 - - A1 = A1 + A2; // AB12 - mVec128 = mVec128 - B1; // AB03 = AB0 - AB3 - A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element - mVec128 = mVec128+ A1; // AB03 + AB12 - -#elif defined(BT_USE_NEON) - - float32x4_t vQ1 = mVec128; - float32x4_t vQ2 = q.get128(); - float32x4_t A0, A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; - - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - - A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x - B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X - - A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z - - A1 = vmulq_f32(A1, B1); - A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 - - // change the sign of the last element - A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); - A0 = vaddq_f32(A0, A1); // AB03 + AB12 - - mVec128 = A0; + + B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2, 0, 1, 2)); + B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2)); + + B1 = B1 * B2; // A3 *= B3 + + mVec128 = bt_splat_ps(mVec128, 3); // A0 + mVec128 = mVec128 * vQ2; // A0 * B0 + + A1 = A1 + A2; // AB12 + mVec128 = mVec128 - B1; // AB03 = AB0 - AB3 + A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element + mVec128 = mVec128 + A1; // AB03 + AB12 + +#elif defined(BT_USE_NEON) + + float32x4_t vQ1 = mVec128; + float32x4_t vQ2 = q.get128(); + float32x4_t A0, A1, B1, A2, B2, A3, B3; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; + + { + float32x2x2_t tmp; + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; + + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + + A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x + B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X + + A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + + A1 = vmulq_f32(A1, B1); + A2 = vmulq_f32(A2, B2); + A3 = vmulq_f32(A3, B3); // A3 *= B3 + A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 + + A1 = 
vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 + + // change the sign of the last element + A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); + A0 = vaddq_f32(A0, A1); // AB03 + AB12 + + mVec128 = A0; #else setValue( - m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(), + m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(), m_floats[3] * q.y() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.x() - m_floats[0] * q.z(), m_floats[3] * q.z() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.y() - m_floats[1] * q.x(), m_floats[3] * q.m_floats[3] - m_floats[0] * q.x() - m_floats[1] * q.y() - m_floats[2] * q.z()); #endif return *this; } - /**@brief Return the dot product between this quaternion and another + /**@brief Return the dot product between this quaternion and another * @param q The other quaternion */ btScalar dot(const btQuaternion& q) const { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vd; - +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vd; + vd = _mm_mul_ps(mVec128, q.mVec128); - - __m128 t = _mm_movehl_ps(vd, vd); + + __m128 t = _mm_movehl_ps(vd, vd); vd = _mm_add_ps(vd, t); t = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, t); - - return _mm_cvtss_f32(vd); + + return _mm_cvtss_f32(vd); #elif defined(BT_USE_NEON) float32x4_t vd = vmulq_f32(mVec128, q.mVec128); - float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd)); + float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd)); x = vpadd_f32(x, x); return vget_lane_f32(x, 0); -#else - return m_floats[0] * q.x() + - m_floats[1] * q.y() + - m_floats[2] * q.z() + - m_floats[3] * q.m_floats[3]; +#else + return m_floats[0] * q.x() + + m_floats[1] * q.y() + + m_floats[2] * q.z() + + m_floats[3] * q.m_floats[3]; #endif } - /**@brief Return the length squared of the quaternion */ + /**@brief Return the length squared of the quaternion */ btScalar length2() const { return dot(*this); } - /**@brief Return the length of the quaternion */ + /**@brief Return the length of the quaternion */ btScalar length() const { return btSqrt(length2()); @@ -376,46 +374,46 @@ public: btQuaternion& safeNormalize() { btScalar l2 = length2(); - if (l2>SIMD_EPSILON) + if (l2 > SIMD_EPSILON) { normalize(); } return *this; } - /**@brief Normalize the quaternion + /**@brief Normalize the quaternion * Such that x^2 + y^2 + z^2 +w^2 = 1 */ - btQuaternion& normalize() + btQuaternion& normalize() { -#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vd; - +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vd; + vd = _mm_mul_ps(mVec128, mVec128); - - __m128 t = _mm_movehl_ps(vd, vd); + + __m128 t = _mm_movehl_ps(vd, vd); vd = _mm_add_ps(vd, t); t = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, t); vd = _mm_sqrt_ss(vd); vd = _mm_div_ss(vOnes, vd); - vd = bt_pshufd_ps(vd, 0); // splat + vd = bt_pshufd_ps(vd, 0); // splat mVec128 = _mm_mul_ps(mVec128, vd); - + return *this; -#else +#else return *this /= length(); #endif } - /**@brief Return a scaled version of this quaternion + /**@brief Return a scaled version of this quaternion * @param s The scale factor */ SIMD_FORCE_INLINE btQuaternion operator*(const btScalar& s) const { -#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = bt_pshufd_ps(vs, 0x00); // (S S S S) - +#if 
defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
+	__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+	vs = bt_pshufd_ps(vs, 0x00);  //	(S S S S)
+
 	return btQuaternion(_mm_mul_ps(mVec128, vs));
 #elif defined(BT_USE_NEON)
 	return btQuaternion(vmulq_n_f32(mVec128, s));
@@ -424,7 +422,7 @@ public:
 #endif
 	}

-	/**@brief Return an inversely scaled versionof this quaternion
+	/**@brief Return an inversely scaled version of this quaternion
   * @param s The inverse scale factor */
 	btQuaternion operator/(const btScalar& s) const
 	{
@@ -432,49 +430,49 @@ public:
 		return *this * (btScalar(1.0) / s);
 	}

-	/**@brief Inversely scale this quaternion
+	/**@brief Inversely scale this quaternion
   * @param s The scale factor */
-	btQuaternion& operator/=(const btScalar& s)
+	btQuaternion& operator/=(const btScalar& s)
 	{
 		btAssert(s != btScalar(0.0));
 		return *this *= btScalar(1.0) / s;
 	}

-	/**@brief Return a normalized version of this quaternion */
-	btQuaternion normalized() const
+	/**@brief Return a normalized version of this quaternion */
+	btQuaternion normalized() const
 	{
 		return *this / length();
-	}
+	}

 /**@brief Return the ***half*** angle between this quaternion and the other
   * @param q The other quaternion */
-	btScalar angle(const btQuaternion& q) const
+	btScalar angle(const btQuaternion& q) const
 	{
 		btScalar s = btSqrt(length2() * q.length2());
 		btAssert(s != btScalar(0.0));
 		return btAcos(dot(q) / s);
 	}
-
+
 /**@brief Return the angle between this quaternion and the other along the shortest path
   * @param q The other quaternion */
-	btScalar angleShortestPath(const btQuaternion& q) const
+	btScalar angleShortestPath(const btQuaternion& q) const
 	{
 		btScalar s = btSqrt(length2() * q.length2());
 		btAssert(s != btScalar(0.0));
-		if (dot(q) < 0) // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
+		if (dot(q) < 0)  // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
 			return btAcos(dot(-q) / s) * btScalar(2.0);
-		else
+		else
 			return btAcos(dot(q) / s) * btScalar(2.0);
 	}

 /**@brief Return the angle [0, 2Pi] of rotation represented by this quaternion */
-	btScalar getAngle() const
+	btScalar getAngle() const
 	{
 		btScalar s = btScalar(2.) * btAcos(m_floats[3]);
 		return s;
 	}

 /**@brief Return the angle [0, Pi] of rotation represented by this quaternion along the shortest path */
-	btScalar getAngleShortestPath() const
+	btScalar getAngleShortestPath() const
 	{
 		btScalar s;
 		if (m_floats[3] >= 0)
@@ -484,120 +482,117 @@ public:
 		return s;
 	}

-
 	/**@brief Return the axis of the rotation represented by this quaternion */
 	btVector3 getAxis() const
 	{
-		btScalar s_squared = 1.f-m_floats[3]*m_floats[3];
-
-		if (s_squared < btScalar(10.) * SIMD_EPSILON) //Check for divide by zero
-			return btVector3(1.0, 0.0, 0.0); // Arbitrary
-		btScalar s = 1.f/btSqrt(s_squared);
+		btScalar s_squared = 1.f - m_floats[3] * m_floats[3];
+
+		if (s_squared < btScalar(10.)
* SIMD_EPSILON)  //Check for divide by zero
+			return btVector3(1.0, 0.0, 0.0);  // Arbitrary
+		btScalar s = 1.f / btSqrt(s_squared);
 		return btVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
 	}

 	/**@brief Return the inverse of this quaternion */
 	btQuaternion inverse() const
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		return btQuaternion(_mm_xor_ps(mVec128, vQInv));
 #elif defined(BT_USE_NEON)
-	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
-#else
+	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
+#else
 		return btQuaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
 #endif
 	}

-	/**@brief Return the sum of this quaternion and the other
+	/**@brief Return the sum of this quaternion and the other
   * @param q2 The other quaternion */
 	SIMD_FORCE_INLINE btQuaternion
 	operator+(const btQuaternion& q2) const
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		return btQuaternion(_mm_add_ps(mVec128, q2.mVec128));
 #elif defined(BT_USE_NEON)
-	return btQuaternion(vaddq_f32(mVec128, q2.mVec128));
-#else
+	return btQuaternion(vaddq_f32(mVec128, q2.mVec128));
+#else
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() + q2.x(), q1.y() + q2.y(), q1.z() + q2.z(), q1.m_floats[3] + q2.m_floats[3]);
 #endif
 	}

-	/**@brief Return the difference between this quaternion and the other
+	/**@brief Return the difference between this quaternion and the other
   * @param q2 The other quaternion */
 	SIMD_FORCE_INLINE btQuaternion
 	operator-(const btQuaternion& q2) const
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		return btQuaternion(_mm_sub_ps(mVec128, q2.mVec128));
 #elif defined(BT_USE_NEON)
-	return btQuaternion(vsubq_f32(mVec128, q2.mVec128));
-#else
+	return btQuaternion(vsubq_f32(mVec128, q2.mVec128));
+#else
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() - q2.x(), q1.y() - q2.y(), q1.z() - q2.z(), q1.m_floats[3] - q2.m_floats[3]);
 #endif
 	}

-	/**@brief Return the negative of this quaternion
+	/**@brief Return the negative of this quaternion
 	* This simply negates each element */
 	SIMD_FORCE_INLINE btQuaternion operator-() const
 	{
-#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
 		return btQuaternion(_mm_xor_ps(mVec128, btvMzeroMask));
 #elif defined(BT_USE_NEON)
-		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask) );
-#else
+		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));
+#else
 		const btQuaternion& q2 = *this;
-		return btQuaternion( - q2.x(), - q2.y(),  - q2.z(),  - q2.m_floats[3]);
+		return btQuaternion(-q2.x(), -q2.y(), -q2.z(), -q2.m_floats[3]);
 #endif
 	}

-	/**@todo document this and it's use */
-	SIMD_FORCE_INLINE btQuaternion farthest( const btQuaternion& qd) const
+	/**@todo document this and its use */
+	SIMD_FORCE_INLINE btQuaternion farthest(const btQuaternion& qd) const
 	{
-	btQuaternion diff,sum;
+		btQuaternion diff, sum;
 		diff = *this - qd;
 		sum = *this + qd;
-		if( diff.dot(diff) > sum.dot(sum) )
+		if (diff.dot(diff) > sum.dot(sum))
 			return qd;
 		return (-qd);
 	}

 	/**@todo document this and its use */
-	SIMD_FORCE_INLINE btQuaternion nearest( const btQuaternion& qd) const
+	SIMD_FORCE_INLINE btQuaternion nearest(const btQuaternion& qd) const
 	{
-	btQuaternion diff,sum;
+	
btQuaternion diff, sum; diff = *this - qd; sum = *this + qd; - if( diff.dot(diff) < sum.dot(sum) ) + if (diff.dot(diff) < sum.dot(sum)) return qd; return (-qd); } - - /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion + /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion * @param q The other quaternion to interpolate with * @param t The ratio between this and q to interpolate. If t = 0 the result is this, if t=1 the result is q. * Slerp interpolates assuming constant velocity. */ btQuaternion slerp(const btQuaternion& q, const btScalar& t) const { - const btScalar magnitude = btSqrt(length2() * q.length2()); btAssert(magnitude > btScalar(0)); - + const btScalar product = dot(q) / magnitude; const btScalar absproduct = btFabs(product); - - if(absproduct < btScalar(1.0 - SIMD_EPSILON)) + + if (absproduct < btScalar(1.0 - SIMD_EPSILON)) { // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp const btScalar theta = btAcos(absproduct); const btScalar d = btSin(theta); btAssert(d > btScalar(0)); - + const btScalar sign = (product < 0) ? btScalar(-1) : btScalar(1); const btScalar s0 = btSin((btScalar(1.0) - t) * theta) / d; const btScalar s1 = btSin(sign * t * theta) / d; - + return btQuaternion( (m_floats[0] * s0 + q.x() * s1), (m_floats[1] * s0 + q.y() * s1), @@ -610,314 +605,308 @@ public: } } - static const btQuaternion& getIdentity() + static const btQuaternion& getIdentity() { - static const btQuaternion identityQuat(btScalar(0.),btScalar(0.),btScalar(0.),btScalar(1.)); + static const btQuaternion identityQuat(btScalar(0.), btScalar(0.), btScalar(0.), btScalar(1.)); return identityQuat; } SIMD_FORCE_INLINE const btScalar& getW() const { return m_floats[3]; } - SIMD_FORCE_INLINE void serialize(struct btQuaternionData& dataOut) const; - - SIMD_FORCE_INLINE void deSerialize(const struct btQuaternionFloatData& dataIn); + SIMD_FORCE_INLINE void serialize(struct btQuaternionData& dataOut) const; - SIMD_FORCE_INLINE void deSerialize(const struct btQuaternionDoubleData& dataIn); + SIMD_FORCE_INLINE void deSerialize(const struct btQuaternionFloatData& dataIn); - SIMD_FORCE_INLINE void serializeFloat(struct btQuaternionFloatData& dataOut) const; + SIMD_FORCE_INLINE void deSerialize(const struct btQuaternionDoubleData& dataIn); - SIMD_FORCE_INLINE void deSerializeFloat(const struct btQuaternionFloatData& dataIn); + SIMD_FORCE_INLINE void serializeFloat(struct btQuaternionFloatData& dataOut) const; - SIMD_FORCE_INLINE void serializeDouble(struct btQuaternionDoubleData& dataOut) const; + SIMD_FORCE_INLINE void deSerializeFloat(const struct btQuaternionFloatData& dataIn); - SIMD_FORCE_INLINE void deSerializeDouble(const struct btQuaternionDoubleData& dataIn); + SIMD_FORCE_INLINE void serializeDouble(struct btQuaternionDoubleData& dataOut) const; + SIMD_FORCE_INLINE void deSerializeDouble(const struct btQuaternionDoubleData& dataIn); }; - - - - /**@brief Return the product of two quaternions */ SIMD_FORCE_INLINE btQuaternion -operator*(const btQuaternion& q1, const btQuaternion& q2) +operator*(const btQuaternion& q1, const btQuaternion& q2) { -#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) __m128 vQ1 = q1.get128(); __m128 vQ2 = q2.get128(); __m128 A0, A1, B1, A2, B2; - - A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0)); // X Y z x // vtrn - B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0)); // 
W W W X // vdup vext + + A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0)); // X Y z x // vtrn + B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0)); // W W W X // vdup vext A1 = A1 * B1; - - A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); // Y Z X Y // vext - B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); // z x Y Y // vtrn vdup + + A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1)); // Y Z X Y // vext + B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1)); // z x Y Y // vtrn vdup A2 = A2 * B2; - B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); // z x Y Z // vtrn vext - B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); // Y Z x z // vext vtrn - - B1 = B1 * B2; // A3 *= B3 + B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2)); // z x Y Z // vtrn vext + B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2)); // Y Z x z // vext vtrn + + B1 = B1 * B2; // A3 *= B3 - A0 = bt_splat_ps(vQ1, 3); // A0 - A0 = A0 * vQ2; // A0 * B0 + A0 = bt_splat_ps(vQ1, 3); // A0 + A0 = A0 * vQ2; // A0 * B0 + + A1 = A1 + A2; // AB12 + A0 = A0 - B1; // AB03 = AB0 - AB3 + + A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element + A0 = A0 + A1; // AB03 + AB12 - A1 = A1 + A2; // AB12 - A0 = A0 - B1; // AB03 = AB0 - AB3 - - A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element - A0 = A0 + A1; // AB03 + AB12 - return btQuaternion(A0); -#elif defined(BT_USE_NEON) +#elif defined(BT_USE_NEON) float32x4_t vQ1 = q1.get128(); float32x4_t vQ2 = q2.get128(); float32x4_t A0, A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; + + { + float32x2x2_t tmp; + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); - A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x - B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X + A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x + B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z A1 = vmulq_f32(A1, B1); A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); // A0 * B0 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 - - // change the sign of the last element - A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); - A0 = vaddq_f32(A0, A1); // AB03 + AB12 - + A3 = vmulq_f32(A3, B3); // A3 *= B3 + A0 = vmulq_lane_f32(vQ2, 
vget_high_f32(vQ1), 1); // A0 * B0 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + A0 = vsubq_f32(A0, A3); // AB03 = AB0 - AB3 + + // change the sign of the last element + A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); + A0 = vaddq_f32(A0, A1); // AB03 + AB12 + return btQuaternion(A0); #else return btQuaternion( - q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(), + q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(), q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(), q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(), - q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z()); + q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z()); #endif } SIMD_FORCE_INLINE btQuaternion operator*(const btQuaternion& q, const btVector3& w) { -#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) __m128 vQ1 = q.get128(); __m128 vQ2 = w.get128(); __m128 A1, B1, A2, B2, A3, B3; - - A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0)); - B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0)); + + A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3, 3, 3, 0)); + B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0, 1, 2, 0)); A1 = A1 * B1; - - A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); - B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); + + A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1)); + B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1)); A2 = A2 * B2; - A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); - B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); - - A3 = A3 * B3; // A3 *= B3 + A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2)); + B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2)); + + A3 = A3 * B3; // A3 *= B3 + + A1 = A1 + A2; // AB12 + A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element + A1 = A1 - A3; // AB123 = AB12 - AB3 - A1 = A1 + A2; // AB12 - A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element - A1 = A1 - A3; // AB123 = AB12 - AB3 - return btQuaternion(A1); - -#elif defined(BT_USE_NEON) + +#elif defined(BT_USE_NEON) float32x4_t vQ1 = q.get128(); float32x4_t vQ2 = w.get128(); float32x4_t A1, B1, A2, B2, A3, B3; - float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz; - - vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); - { - float32x2x2_t tmp; + float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; + vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); + { + float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; - } + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; + } - vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); - vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); - A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W W X - B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx); // X Y z x + vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1); + vQ2xz = vext_f32(vQ2zx, vQ2zx, 1); + + A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W W X + B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx); // X Y z x A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1)); - B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 
1)); + B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1)); - A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z - B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z + A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z + B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z A1 = vmulq_f32(A1, B1); A2 = vmulq_f32(A2, B2); - A3 = vmulq_f32(A3, B3); // A3 *= B3 - - A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 - - // change the sign of the last element - A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); - - A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 - + A3 = vmulq_f32(A3, B3); // A3 *= B3 + + A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2 + + // change the sign of the last element + A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM); + + A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3 + return btQuaternion(A1); - + #else - return btQuaternion( - q.w() * w.x() + q.y() * w.z() - q.z() * w.y(), - q.w() * w.y() + q.z() * w.x() - q.x() * w.z(), - q.w() * w.z() + q.x() * w.y() - q.y() * w.x(), - -q.x() * w.x() - q.y() * w.y() - q.z() * w.z()); + return btQuaternion( + q.w() * w.x() + q.y() * w.z() - q.z() * w.y(), + q.w() * w.y() + q.z() * w.x() - q.x() * w.z(), + q.w() * w.z() + q.x() * w.y() - q.y() * w.x(), + -q.x() * w.x() - q.y() * w.y() - q.z() * w.z()); #endif } SIMD_FORCE_INLINE btQuaternion operator*(const btVector3& w, const btQuaternion& q) { -#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) __m128 vQ1 = w.get128(); __m128 vQ2 = q.get128(); __m128 A1, B1, A2, B2, A3, B3; - - A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0)); // X Y z x - B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0)); // W W W X + + A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0)); // X Y z x + B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0)); // W W W X A1 = A1 * B1; - - A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); - B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); - A2 = A2 *B2; + A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1)); + B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1)); - A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); - B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); - - A3 = A3 * B3; // A3 *= B3 + A2 = A2 * B2; + + A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2)); + B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2)); + + A3 = A3 * B3; // A3 *= B3 + + A1 = A1 + A2; // AB12 + A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element + A1 = A1 - A3; // AB123 = AB12 - AB3 - A1 = A1 + A2; // AB12 - A1 = _mm_xor_ps(A1, vPPPM); // change sign of the last element - A1 = A1 - A3; // AB123 = AB12 - AB3 - return btQuaternion(A1); -#elif defined(BT_USE_NEON) +#elif defined(BT_USE_NEON) float32x4_t vQ1 = w.get128(); float32x4_t vQ2 = q.get128(); - float32x4_t A1, B1, A2, B2, A3, B3; - float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; - - { - float32x2x2_t tmp; - - tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) ); // {z x}, {w y} - vQ1zx = tmp.val[0]; + float32x4_t A1, B1, A2, B2, A3, B3; + float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz; + + { + float32x2x2_t tmp; - tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) ); // {z x}, {w y} - vQ2zx = tmp.val[0]; - } - vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); + tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1)); // {z x}, {w y} + vQ1zx = tmp.val[0]; - vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1); + tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2)); // {z x}, {w y} + vQ2zx = tmp.val[0]; + } + vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); - vQ2yz = 
vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
-	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

-	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx); // X Y z x
-	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W W X
+	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y z x
+	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W W X

 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
-	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

-	A3 = vcombine_f32(vQ1zx, vQ1yz); // Z X Y Z
-	B3 = vcombine_f32(vQ2yz, vQ2xz); // Y Z x z
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z

 	A1 = vmulq_f32(A1, B1);
 	A2 = vmulq_f32(A2, B2);
-	A3 = vmulq_f32(A3, B3); // A3 *= B3
-
-	A1 = vaddq_f32(A1, A2); // AB12 = AB1 + AB2
-
-	// change the sign of the last element
-	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
-
-	A1 = vsubq_f32(A1, A3); // AB123 = AB12 - AB3
-
+	A3 = vmulq_f32(A3, B3);  // A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);  // AB12 = AB1 + AB2
+
+	// change the sign of the last element
+	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
+
+	A1 = vsubq_f32(A1, A3);  // AB123 = AB12 - AB3
+
 	return btQuaternion(A1);
-
+
 #else
-	return btQuaternion(
-		 +w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
+	return btQuaternion(
+		+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
 		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
 		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
-		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
+		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
 #endif
 }

 /**@brief Calculate the dot product between two quaternions */
-SIMD_FORCE_INLINE btScalar
-dot(const btQuaternion& q1, const btQuaternion& q2)
-{
-	return q1.dot(q2);
+SIMD_FORCE_INLINE btScalar
+dot(const btQuaternion& q1, const btQuaternion& q2)
+{
+	return q1.dot(q2);
 }

-
 /**@brief Return the length of a quaternion */
 SIMD_FORCE_INLINE btScalar
-length(const btQuaternion& q)
-{
-	return q.length();
+length(const btQuaternion& q)
+{
+	return q.length();
 }

 /**@brief Return the angle between two quaternions*/
 SIMD_FORCE_INLINE btScalar
-btAngle(const btQuaternion& q1, const btQuaternion& q2)
-{
-	return q1.angle(q2);
+btAngle(const btQuaternion& q1, const btQuaternion& q2)
+{
+	return q1.angle(q2);
 }

 /**@brief Return the inverse of a quaternion*/
 SIMD_FORCE_INLINE btQuaternion
-inverse(const btQuaternion& q)
+inverse(const btQuaternion& q)
 {
 	return q.inverse();
 }
@@ -928,115 +917,105 @@ inverse(const btQuaternion& q)
 * @param t The ratio between q1 and q2. t = 0 returns q1, t=1 returns q2
 * Slerp assumes constant velocity between positions.
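 * Illustrative example (uses only the public Bullet API declared in this header):
 * halfway between the identity and a 90 degree turn about Z is a 45 degree turn:
 *   btQuaternion a = btQuaternion::getIdentity();
 *   btQuaternion b(btVector3(0, 0, 1), SIMD_HALF_PI);
 *   btQuaternion mid = slerp(a, b, btScalar(0.5));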
*/ SIMD_FORCE_INLINE btQuaternion -slerp(const btQuaternion& q1, const btQuaternion& q2, const btScalar& t) +slerp(const btQuaternion& q1, const btQuaternion& q2, const btScalar& t) { return q1.slerp(q2, t); } -SIMD_FORCE_INLINE btVector3 -quatRotate(const btQuaternion& rotation, const btVector3& v) +SIMD_FORCE_INLINE btVector3 +quatRotate(const btQuaternion& rotation, const btVector3& v) { btQuaternion q = rotation * v; q *= rotation.inverse(); -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask)); #elif defined(BT_USE_NEON) - return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask)); -#else - return btVector3(q.getX(),q.getY(),q.getZ()); + return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask)); +#else + return btVector3(q.getX(), q.getY(), q.getZ()); #endif } -SIMD_FORCE_INLINE btQuaternion -shortestArcQuat(const btVector3& v0, const btVector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized +SIMD_FORCE_INLINE btQuaternion +shortestArcQuat(const btVector3& v0, const btVector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized { btVector3 c = v0.cross(v1); - btScalar d = v0.dot(v1); + btScalar d = v0.dot(v1); if (d < -1.0 + SIMD_EPSILON) { - btVector3 n,unused; - btPlaneSpace1(v0,n,unused); - return btQuaternion(n.x(),n.y(),n.z(),0.0f); // just pick any vector that is orthogonal to v0 + btVector3 n, unused; + btPlaneSpace1(v0, n, unused); + return btQuaternion(n.x(), n.y(), n.z(), 0.0f); // just pick any vector that is orthogonal to v0 } - btScalar s = btSqrt((1.0f + d) * 2.0f); + btScalar s = btSqrt((1.0f + d) * 2.0f); btScalar rs = 1.0f / s; - return btQuaternion(c.getX()*rs,c.getY()*rs,c.getZ()*rs,s * 0.5f); + return btQuaternion(c.getX() * rs, c.getY() * rs, c.getZ() * rs, s * 0.5f); } -SIMD_FORCE_INLINE btQuaternion -shortestArcQuatNormalize2(btVector3& v0,btVector3& v1) +SIMD_FORCE_INLINE btQuaternion +shortestArcQuatNormalize2(btVector3& v0, btVector3& v1) { v0.normalize(); v1.normalize(); - return shortestArcQuat(v0,v1); + return shortestArcQuat(v0, v1); } - - - -struct btQuaternionFloatData +struct btQuaternionFloatData { - float m_floats[4]; + float m_floats[4]; }; -struct btQuaternionDoubleData +struct btQuaternionDoubleData { - double m_floats[4]; - + double m_floats[4]; }; -SIMD_FORCE_INLINE void btQuaternion::serializeFloat(struct btQuaternionFloatData& dataOut) const +SIMD_FORCE_INLINE void btQuaternion::serializeFloat(struct btQuaternionFloatData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = float(m_floats[i]); } -SIMD_FORCE_INLINE void btQuaternion::deSerializeFloat(const struct btQuaternionFloatData& dataIn) +SIMD_FORCE_INLINE void btQuaternion::deSerializeFloat(const struct btQuaternionFloatData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = btScalar(dataIn.m_floats[i]); } - -SIMD_FORCE_INLINE void btQuaternion::serializeDouble(struct btQuaternionDoubleData& dataOut) const +SIMD_FORCE_INLINE void btQuaternion::serializeDouble(struct btQuaternionDoubleData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = double(m_floats[i]); } -SIMD_FORCE_INLINE void btQuaternion::deSerializeDouble(const struct 
btQuaternionDoubleData& dataIn) +SIMD_FORCE_INLINE void btQuaternion::deSerializeDouble(const struct btQuaternionDoubleData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = btScalar(dataIn.m_floats[i]); } - -SIMD_FORCE_INLINE void btQuaternion::serialize(struct btQuaternionData& dataOut) const +SIMD_FORCE_INLINE void btQuaternion::serialize(struct btQuaternionData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = m_floats[i]; } -SIMD_FORCE_INLINE void btQuaternion::deSerialize(const struct btQuaternionFloatData& dataIn) +SIMD_FORCE_INLINE void btQuaternion::deSerialize(const struct btQuaternionFloatData& dataIn) { - for (int i = 0; i<4; i++) + for (int i = 0; i < 4; i++) m_floats[i] = (btScalar)dataIn.m_floats[i]; } -SIMD_FORCE_INLINE void btQuaternion::deSerialize(const struct btQuaternionDoubleData& dataIn) +SIMD_FORCE_INLINE void btQuaternion::deSerialize(const struct btQuaternionDoubleData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = (btScalar)dataIn.m_floats[i]; } - -#endif //BT_SIMD__QUATERNION_H_ - - - +#endif //BT_SIMD__QUATERNION_H_ diff --git a/thirdparty/bullet/LinearMath/btQuickprof.cpp b/thirdparty/bullet/LinearMath/btQuickprof.cpp index 1572b96262..86fd1d7812 100644 --- a/thirdparty/bullet/LinearMath/btQuickprof.cpp +++ b/thirdparty/bullet/LinearMath/btQuickprof.cpp @@ -16,16 +16,13 @@ #include "btQuickprof.h" #include "btThreads.h" - - - #ifdef __CELLOS_LV2__ #include #include #include #endif -#if defined (SUNOS) || defined (__SUNOS__) +#if defined(SUNOS) || defined(__SUNOS__) #include #endif #ifdef __APPLE__ @@ -42,49 +39,46 @@ #define NOIME #ifdef _XBOX - #include -#else //_XBOX - #include +#include +#else //_XBOX +#include -#if WINVER <0x0602 +#if WINVER < 0x0602 #define GetTickCount64 GetTickCount #endif -#endif //_XBOX +#endif //_XBOX #include - -#else //_WIN32 +#else //_WIN32 #include #ifdef BT_LINUX_REALTIME //required linking against rt (librt) #include -#endif //BT_LINUX_REALTIME +#endif //BT_LINUX_REALTIME -#endif //_WIN32 +#endif //_WIN32 -#define mymin(a,b) (a > b ? a : b) +#define mymin(a, b) (a > b ? a : b) struct btClockData { - #ifdef BT_USE_WINDOWS_TIMERS LARGE_INTEGER mClockFrequency; LONGLONG mStartTick; LARGE_INTEGER mStartTime; #else #ifdef __CELLOS_LV2__ - uint64_t mStartTime; + uint64_t mStartTime; #else #ifdef __APPLE__ - uint64_t mStartTimeNano; + uint64_t mStartTimeNano; #endif struct timeval mStartTime; #endif -#endif //__CELLOS_LV2__ - +#endif //__CELLOS_LV2__ }; ///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling. @@ -114,8 +108,7 @@ btClock& btClock::operator=(const btClock& other) return *this; } - - /// Resets the initial reference time. +/// Resets the initial reference time. 
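+/// (Illustrative note, not in the upstream comment: after reset() every
+/// getTime*() accessor below measures from this point instead of from
+/// construction.)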
void btClock::reset() { #ifdef BT_USE_WINDOWS_TIMERS @@ -124,14 +117,14 @@ void btClock::reset() #else #ifdef __CELLOS_LV2__ - typedef uint64_t ClockSize; + typedef uint64_t ClockSize; ClockSize newTime; //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); - SYS_TIMEBASE_GET( newTime ); + SYS_TIMEBASE_GET(newTime); m_data->mStartTime = newTime; #else #ifdef __APPLE__ - m_data->mStartTimeNano = mach_absolute_time(); + m_data->mStartTimeNano = mach_absolute_time(); #endif gettimeofday(&m_data->mStartTime, 0); #endif @@ -146,66 +139,66 @@ unsigned long long int btClock::getTimeMilliseconds() LARGE_INTEGER currentTime; QueryPerformanceCounter(¤tTime); LONGLONG elapsedTime = currentTime.QuadPart - - m_data->mStartTime.QuadPart; - // Compute the number of millisecond ticks elapsed. + m_data->mStartTime.QuadPart; + // Compute the number of millisecond ticks elapsed. unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / - m_data->mClockFrequency.QuadPart); + m_data->mClockFrequency.QuadPart); - return msecTicks; + return msecTicks; #else #ifdef __CELLOS_LV2__ - uint64_t freq=sys_time_get_timebase_frequency(); - double dFreq=((double) freq) / 1000.0; - typedef uint64_t ClockSize; - ClockSize newTime; - SYS_TIMEBASE_GET( newTime ); - //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); - - return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); + uint64_t freq = sys_time_get_timebase_frequency(); + double dFreq = ((double)freq) / 1000.0; + typedef uint64_t ClockSize; + ClockSize newTime; + SYS_TIMEBASE_GET(newTime); + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + + return (unsigned long int)((double(newTime - m_data->mStartTime)) / dFreq); #else - struct timeval currentTime; - gettimeofday(¤tTime, 0); - return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 + - (currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000; -#endif //__CELLOS_LV2__ + struct timeval currentTime; + gettimeofday(¤tTime, 0); + return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 + + (currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000; +#endif //__CELLOS_LV2__ #endif } - /// Returns the time in us since the last call to reset or since - /// the Clock was created. +/// Returns the time in us since the last call to reset or since +/// the Clock was created. 
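+/// A hypothetical timing sketch using this API (the workload name is
+/// illustrative):
+///   btClock clk;
+///   clk.reset();
+///   runWorkload();
+///   unsigned long long us = clk.getTimeMicroseconds();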
unsigned long long int btClock::getTimeMicroseconds() { #ifdef BT_USE_WINDOWS_TIMERS - //see https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx - LARGE_INTEGER currentTime, elapsedTime; - - QueryPerformanceCounter(¤tTime); - elapsedTime.QuadPart = currentTime.QuadPart - - m_data->mStartTime.QuadPart; - elapsedTime.QuadPart *= 1000000; - elapsedTime.QuadPart /= m_data->mClockFrequency.QuadPart; - - return (unsigned long long) elapsedTime.QuadPart; + //see https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx + LARGE_INTEGER currentTime, elapsedTime; + + QueryPerformanceCounter(¤tTime); + elapsedTime.QuadPart = currentTime.QuadPart - + m_data->mStartTime.QuadPart; + elapsedTime.QuadPart *= 1000000; + elapsedTime.QuadPart /= m_data->mClockFrequency.QuadPart; + + return (unsigned long long)elapsedTime.QuadPart; #else #ifdef __CELLOS_LV2__ - uint64_t freq=sys_time_get_timebase_frequency(); - double dFreq=((double) freq)/ 1000000.0; - typedef uint64_t ClockSize; - ClockSize newTime; - //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); - SYS_TIMEBASE_GET( newTime ); - - return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); + uint64_t freq = sys_time_get_timebase_frequency(); + double dFreq = ((double)freq) / 1000000.0; + typedef uint64_t ClockSize; + ClockSize newTime; + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + SYS_TIMEBASE_GET(newTime); + + return (unsigned long int)((double(newTime - m_data->mStartTime)) / dFreq); #else - struct timeval currentTime; - gettimeofday(¤tTime, 0); - return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + - (currentTime.tv_usec - m_data->mStartTime.tv_usec); -#endif//__CELLOS_LV2__ + struct timeval currentTime; + gettimeofday(¤tTime, 0); + return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + + (currentTime.tv_usec - m_data->mStartTime.tv_usec); +#endif //__CELLOS_LV2__ #endif } @@ -213,65 +206,63 @@ unsigned long long int btClock::getTimeNanoseconds() { #ifdef BT_USE_WINDOWS_TIMERS //see https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx - LARGE_INTEGER currentTime, elapsedTime; - - QueryPerformanceCounter(¤tTime); - elapsedTime.QuadPart = currentTime.QuadPart - - m_data->mStartTime.QuadPart; - elapsedTime.QuadPart *= 1000000000; - elapsedTime.QuadPart /= m_data->mClockFrequency.QuadPart; - - return (unsigned long long) elapsedTime.QuadPart; + LARGE_INTEGER currentTime, elapsedTime; + + QueryPerformanceCounter(¤tTime); + elapsedTime.QuadPart = currentTime.QuadPart - + m_data->mStartTime.QuadPart; + elapsedTime.QuadPart *= 1000000000; + elapsedTime.QuadPart /= m_data->mClockFrequency.QuadPart; + + return (unsigned long long)elapsedTime.QuadPart; #else #ifdef __CELLOS_LV2__ - uint64_t freq=sys_time_get_timebase_frequency(); - double dFreq=((double) freq)/ 1e9; - typedef uint64_t ClockSize; - ClockSize newTime; - //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); - SYS_TIMEBASE_GET( newTime ); - - return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); + uint64_t freq = sys_time_get_timebase_frequency(); + double dFreq = ((double)freq) / 1e9; + typedef uint64_t ClockSize; + ClockSize newTime; + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + SYS_TIMEBASE_GET(newTime); + + return (unsigned long int)((double(newTime - m_data->mStartTime)) / dFreq); #else #ifdef __APPLE__ - uint64_t ticks = mach_absolute_time() - m_data->mStartTimeNano; - static long double 
conversion = 0.0L; - if( 0.0L == conversion ) - { - // attempt to get conversion to nanoseconds - mach_timebase_info_data_t info; - int err = mach_timebase_info( &info ); - if( err ) - { - btAssert(0); - conversion = 1.; - } - conversion = info.numer / info.denom; - } - return (ticks * conversion); - - -#else//__APPLE__ - + uint64_t ticks = mach_absolute_time() - m_data->mStartTimeNano; + static long double conversion = 0.0L; + if (0.0L == conversion) + { + // attempt to get conversion to nanoseconds + mach_timebase_info_data_t info; + int err = mach_timebase_info(&info); + if (err) + { + btAssert(0); + conversion = 1.; + } + conversion = info.numer / info.denom; + } + return (ticks * conversion); + +#else //__APPLE__ + #ifdef BT_LINUX_REALTIME - timespec ts; - clock_gettime(CLOCK_REALTIME,&ts); - return 1000000000*ts.tv_sec + ts.tv_nsec; + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return 1000000000 * ts.tv_sec + ts.tv_nsec; #else - struct timeval currentTime; - gettimeofday(¤tTime, 0); - return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1e9 + - (currentTime.tv_usec - m_data->mStartTime.tv_usec)*1000; -#endif //BT_LINUX_REALTIME - -#endif//__APPLE__ -#endif//__CELLOS_LV2__ -#endif + struct timeval currentTime; + gettimeofday(¤tTime, 0); + return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1e9 + + (currentTime.tv_usec - m_data->mStartTime.tv_usec) * 1000; +#endif //BT_LINUX_REALTIME + +#endif //__APPLE__ +#endif //__CELLOS_LV2__ +#endif } - -/// Returns the time in s since the last call to reset or since +/// Returns the time in s since the last call to reset or since /// the Clock was created. btScalar btClock::getTimeSeconds() { @@ -281,23 +272,19 @@ btScalar btClock::getTimeSeconds() #ifndef BT_NO_PROFILE - static btClock gProfileClock; - -inline void Profile_Get_Ticks(unsigned long int * ticks) +inline void Profile_Get_Ticks(unsigned long int* ticks) { *ticks = (unsigned long int)gProfileClock.getTimeMicroseconds(); } inline float Profile_Get_Tick_Rate(void) { -// return 1000000.f; + // return 1000000.f; return 1000.f; - } - /*************************************************************************************************** ** ** CProfileNode @@ -313,35 +300,32 @@ inline float Profile_Get_Tick_Rate(void) * The name is assumed to be a static pointer, only the pointer is stored and compared for * * efficiency reasons. 
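 * (Illustrative consequence, not in the upstream comment: a name built at     *
 * runtime gets a fresh address on every call, so Get_Sub_Node would allocate  *
 * a new node each time; always pass string literals.)                         *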
* *=============================================================================================*/ -CProfileNode::CProfileNode( const char * name, CProfileNode * parent ) : - Name( name ), - TotalCalls( 0 ), - TotalTime( 0 ), - StartTime( 0 ), - RecursionCounter( 0 ), - Parent( parent ), - Child( NULL ), - Sibling( NULL ), - m_userPtr(0) +CProfileNode::CProfileNode(const char* name, CProfileNode* parent) : Name(name), + TotalCalls(0), + TotalTime(0), + StartTime(0), + RecursionCounter(0), + Parent(parent), + Child(NULL), + Sibling(NULL), + m_userPtr(0) { Reset(); } - -void CProfileNode::CleanupMemory() +void CProfileNode::CleanupMemory() { - delete ( Child); + delete (Child); Child = NULL; - delete ( Sibling); + delete (Sibling); Sibling = NULL; } -CProfileNode::~CProfileNode( void ) +CProfileNode::~CProfileNode(void) { CleanupMemory(); } - /*********************************************************************************************** * INPUT: * * name - static string pointer to the name of the node we are searching for * @@ -350,12 +334,14 @@ CProfileNode::~CProfileNode( void ) * All profile names are assumed to be static strings so this function uses pointer compares * * to find the named node. * *=============================================================================================*/ -CProfileNode * CProfileNode::Get_Sub_Node( const char * name ) +CProfileNode* CProfileNode::Get_Sub_Node(const char* name) { // Try to find this sub node - CProfileNode * child = Child; - while ( child ) { - if ( child->Name == name ) { + CProfileNode* child = Child; + while (child) + { + if (child->Name == name) + { return child; } child = child->Sibling; @@ -363,176 +349,212 @@ CProfileNode * CProfileNode::Get_Sub_Node( const char * name ) // We didn't find it, so add it - CProfileNode * node = new CProfileNode( name, this ); + CProfileNode* node = new CProfileNode(name, this); node->Sibling = Child; Child = node; return node; } - -void CProfileNode::Reset( void ) +void CProfileNode::Reset(void) { TotalCalls = 0; TotalTime = 0.0f; - - if ( Child ) { + if (Child) + { Child->Reset(); } - if ( Sibling ) { + if (Sibling) + { Sibling->Reset(); } } - -void CProfileNode::Call( void ) +void CProfileNode::Call(void) { TotalCalls++; - if (RecursionCounter++ == 0) { + if (RecursionCounter++ == 0) + { Profile_Get_Ticks(&StartTime); } } - -bool CProfileNode::Return( void ) +bool CProfileNode::Return(void) { - if ( --RecursionCounter == 0 && TotalCalls != 0 ) { + if (--RecursionCounter == 0 && TotalCalls != 0) + { unsigned long int time; Profile_Get_Ticks(&time); - time-=StartTime; + time -= StartTime; TotalTime += (float)time / Profile_Get_Tick_Rate(); } - return ( RecursionCounter == 0 ); + return (RecursionCounter == 0); } - /*************************************************************************************************** ** ** CProfileIterator ** ***************************************************************************************************/ -CProfileIterator::CProfileIterator( CProfileNode * start ) +CProfileIterator::CProfileIterator(CProfileNode* start) { CurrentParent = start; CurrentChild = CurrentParent->Get_Child(); } - -void CProfileIterator::First(void) +void CProfileIterator::First(void) { CurrentChild = CurrentParent->Get_Child(); } - -void CProfileIterator::Next(void) +void CProfileIterator::Next(void) { CurrentChild = CurrentChild->Get_Sibling(); } - -bool CProfileIterator::Is_Done(void) +bool CProfileIterator::Is_Done(void) { return CurrentChild == NULL; } - -void 
CProfileIterator::Enter_Child( int index ) +void CProfileIterator::Enter_Child(int index) { CurrentChild = CurrentParent->Get_Child(); - while ( (CurrentChild != NULL) && (index != 0) ) { + while ((CurrentChild != NULL) && (index != 0)) + { index--; CurrentChild = CurrentChild->Get_Sibling(); } - if ( CurrentChild != NULL ) { + if (CurrentChild != NULL) + { CurrentParent = CurrentChild; CurrentChild = CurrentParent->Get_Child(); } } - -void CProfileIterator::Enter_Parent( void ) +void CProfileIterator::Enter_Parent(void) { - if ( CurrentParent->Get_Parent() != NULL ) { + if (CurrentParent->Get_Parent() != NULL) + { CurrentParent = CurrentParent->Get_Parent(); } CurrentChild = CurrentParent->Get_Child(); } - /*************************************************************************************************** ** ** CProfileManager ** ***************************************************************************************************/ - - - -CProfileNode gRoots[BT_QUICKPROF_MAX_THREAD_COUNT]={ - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL), - CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL),CProfileNode("Root",NULL) -}; - - -CProfileNode* gCurrentNodes[BT_QUICKPROF_MAX_THREAD_COUNT]= -{ - &gRoots[ 0], &gRoots[ 1], &gRoots[ 2], &gRoots[ 3], - &gRoots[ 4], &gRoots[ 5], &gRoots[ 6], &gRoots[ 7], - &gRoots[ 8], &gRoots[ 9], &gRoots[10], &gRoots[11], - &gRoots[12], &gRoots[13], &gRoots[14], &gRoots[15], - &gRoots[16], &gRoots[17], &gRoots[18], &gRoots[19], - &gRoots[20], &gRoots[21], &gRoots[22], &gRoots[23], - &gRoots[24], &gRoots[25], &gRoots[26], &gRoots[27], - &gRoots[28], &gRoots[29], &gRoots[30], &gRoots[31], - &gRoots[32], &gRoots[33], &gRoots[34], &gRoots[35], - &gRoots[36], &gRoots[37], &gRoots[38], &gRoots[39], - &gRoots[40], &gRoots[41], &gRoots[42], &gRoots[43], - &gRoots[44], &gRoots[45], &gRoots[46], &gRoots[47], - &gRoots[48], &gRoots[49], &gRoots[50], &gRoots[51], - &gRoots[52], &gRoots[53], &gRoots[54], &gRoots[55], - 
&gRoots[56], &gRoots[57], &gRoots[58], &gRoots[59], - &gRoots[60], &gRoots[61], &gRoots[62], &gRoots[63], +CProfileNode gRoots[BT_QUICKPROF_MAX_THREAD_COUNT] = { + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), + CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL), CProfileNode("Root", NULL)}; + +CProfileNode* gCurrentNodes[BT_QUICKPROF_MAX_THREAD_COUNT] = + { + &gRoots[0], + &gRoots[1], + &gRoots[2], + &gRoots[3], + &gRoots[4], + &gRoots[5], + &gRoots[6], + &gRoots[7], + &gRoots[8], + &gRoots[9], + &gRoots[10], + &gRoots[11], + &gRoots[12], + &gRoots[13], + &gRoots[14], + &gRoots[15], + &gRoots[16], + &gRoots[17], + &gRoots[18], + &gRoots[19], + &gRoots[20], + &gRoots[21], + &gRoots[22], + &gRoots[23], + &gRoots[24], + &gRoots[25], + &gRoots[26], + &gRoots[27], + &gRoots[28], + &gRoots[29], + &gRoots[30], + &gRoots[31], + &gRoots[32], + &gRoots[33], + &gRoots[34], + &gRoots[35], + &gRoots[36], + &gRoots[37], + &gRoots[38], + &gRoots[39], + &gRoots[40], + &gRoots[41], + &gRoots[42], + &gRoots[43], + &gRoots[44], + &gRoots[45], + &gRoots[46], + &gRoots[47], + &gRoots[48], + &gRoots[49], + &gRoots[50], + &gRoots[51], + &gRoots[52], + &gRoots[53], + &gRoots[54], + &gRoots[55], + &gRoots[56], + &gRoots[57], + &gRoots[58], + &gRoots[59], + &gRoots[60], + &gRoots[61], + &gRoots[62], + &gRoots[63], }; +int CProfileManager::FrameCounter = 0; +unsigned long int CProfileManager::ResetTime = 0; -int CProfileManager::FrameCounter = 0; -unsigned long int CProfileManager::ResetTime = 0; - -CProfileIterator * CProfileManager::Get_Iterator( void ) -{ - - int threadIndex = btQuickprofGetCurrentThreadIndex2(); - if ((threadIndex<0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) - return 0; +CProfileIterator* CProfileManager::Get_Iterator(void) +{ + int threadIndex = btQuickprofGetCurrentThreadIndex2(); + if ((threadIndex < 0) || threadIndex >= 
BT_QUICKPROF_MAX_THREAD_COUNT) + return 0; - return new CProfileIterator( &gRoots[threadIndex]); + return new CProfileIterator(&gRoots[threadIndex]); } -void CProfileManager::CleanupMemory(void) +void CProfileManager::CleanupMemory(void) { - for (int i=0;i= BT_QUICKPROF_MAX_THREAD_COUNT) + if ((threadIndex < 0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) return; - if (name != gCurrentNodes[threadIndex]->Get_Name()) { - gCurrentNodes[threadIndex] = gCurrentNodes[threadIndex]->Get_Sub_Node( name ); + if (name != gCurrentNodes[threadIndex]->Get_Name()) + { + gCurrentNodes[threadIndex] = gCurrentNodes[threadIndex]->Get_Sub_Node(name); } gCurrentNodes[threadIndex]->Call(); } - /*********************************************************************************************** * CProfileManager::Stop_Profile -- Stop timing and record the results. * *=============================================================================================*/ -void CProfileManager::Stop_Profile( void ) +void CProfileManager::Stop_Profile(void) { int threadIndex = btQuickprofGetCurrentThreadIndex2(); - if ((threadIndex<0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) + if ((threadIndex < 0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) return; // Return will indicate whether we should back up to our parent (we may // be profiling a recursive function) - if (gCurrentNodes[threadIndex]->Return()) { + if (gCurrentNodes[threadIndex]->Return()) + { gCurrentNodes[threadIndex] = gCurrentNodes[threadIndex]->Get_Parent(); } } - - - - - /*********************************************************************************************** * CProfileManager::Reset -- Reset the contents of the profiling system * * * * This resets everything except for the tree structure. All of the timing data is reset. 
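 *
 * A minimal usage sketch (illustrative, not from this diff; stepSimulation() stands in
 * for real work, and with this change BT_ENABLE_PROFILE must be defined so that
 * BT_NO_PROFILE stays unset):
 *
 *     CProfileManager::Reset();                        // once, e.g. after warm-up
 *     for (int frame = 0; frame < numFrames; ++frame)
 *     {
 *         {
 *             BT_PROFILE("frame");                     // scoped CProfileSample
 *             stepSimulation();
 *         }
 *         CProfileManager::Increment_Frame_Counter();
 *     }
 *     CProfileManager::dumpAll();                      // ms/frame divides by the frame count
 *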
* *=============================================================================================*/ -void CProfileManager::Reset( void ) +void CProfileManager::Reset(void) { gProfileClock.reset(); int threadIndex = btQuickprofGetCurrentThreadIndex2(); - if ((threadIndex<0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) + if ((threadIndex < 0) || threadIndex >= BT_QUICKPROF_MAX_THREAD_COUNT) return; gRoots[threadIndex].Reset(); gRoots[threadIndex].Call(); @@ -598,20 +616,18 @@ void CProfileManager::Reset( void ) Profile_Get_Ticks(&ResetTime); } - /*********************************************************************************************** * CProfileManager::Increment_Frame_Counter -- Increment the frame counter * *=============================================================================================*/ -void CProfileManager::Increment_Frame_Counter( void ) +void CProfileManager::Increment_Frame_Counter(void) { FrameCounter++; } - /*********************************************************************************************** * CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset * *=============================================================================================*/ -float CProfileManager::Get_Time_Since_Reset( void ) +float CProfileManager::Get_Time_Since_Reset(void) { unsigned long int time; Profile_Get_Ticks(&time); @@ -621,34 +637,34 @@ float CProfileManager::Get_Time_Since_Reset( void ) #include -void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing) +void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing) { profileIterator->First(); if (profileIterator->Is_Done()) return; - float accumulated_time=0,parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time(); + float accumulated_time = 0, parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time(); int i; int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset(); - for (i=0;iGet_Current_Parent_Name(), parent_time ); + for (i = 0; i < spacing; i++) printf("."); + printf("Profiling: %s (total running time: %.3f ms) ---\n", profileIterator->Get_Current_Parent_Name(), parent_time); float totalTime = 0.f; - int numChildren = 0; - for (i = 0; !profileIterator->Is_Done(); i++,profileIterator->Next()) + for (i = 0; !profileIterator->Is_Done(); i++, profileIterator->Next()) { numChildren++; float current_total_time = profileIterator->Get_Current_Total_Time(); accumulated_time += current_total_time; float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f; { - int i; for (i=0;iGet_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls()); + printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n", i, profileIterator->Get_Current_Name(), fraction, (current_total_time / (double)frames_since_reset), profileIterator->Get_Current_Total_Calls()); totalTime += current_total_time; //recurse into children } @@ -657,29 +673,45 @@ void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spaci { //printf("what's wrong\n"); } - for (i=0;i SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time); + for (i = 0; i < spacing; i++) printf("."); + printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:", parent_time > SIMD_EPSILON ? 
((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time); - for (i=0;iEnter_Child(i); - dumpRecursive(profileIterator,spacing+3); + dumpRecursive(profileIterator, spacing + 3); profileIterator->Enter_Parent(); } } - - -void CProfileManager::dumpAll() +void CProfileManager::dumpAll() { CProfileIterator* profileIterator = 0; profileIterator = CProfileManager::Get_Iterator(); - dumpRecursive(profileIterator,0); + dumpRecursive(profileIterator, 0); CProfileManager::Release_Iterator(profileIterator); } + +void btEnterProfileZoneDefault(const char* name) +{ +} +void btLeaveProfileZoneDefault() +{ +} + +#else +void btEnterProfileZoneDefault(const char* name) +{ +} +void btLeaveProfileZoneDefault() +{ +} +#endif //BT_NO_PROFILE + + // clang-format off #if defined(_WIN32) && (defined(__MINGW32__) || defined(__MINGW64__)) #define BT_HAVE_TLS 1 @@ -703,50 +735,31 @@ void CProfileManager::dumpAll() #endif // defined(__ANDROID__) && defined(__clang__) // clang-format on -unsigned int btQuickprofGetCurrentThreadIndex2() { - const unsigned int kNullIndex = ~0U; +unsigned int btQuickprofGetCurrentThreadIndex2() +{ + const unsigned int kNullIndex = ~0U; #if BT_THREADSAFE - return btGetCurrentThreadIndex(); + return btGetCurrentThreadIndex(); #else #if defined(BT_HAVE_TLS) - static __thread unsigned int sThreadIndex = kNullIndex; + static __thread unsigned int sThreadIndex = kNullIndex; #elif defined(_WIN32) - __declspec(thread) static unsigned int sThreadIndex = kNullIndex; + __declspec(thread) static unsigned int sThreadIndex = kNullIndex; #else - unsigned int sThreadIndex = 0; - return -1; + unsigned int sThreadIndex = 0; + return -1; #endif - static int gThreadCounter = 0; - - if (sThreadIndex == kNullIndex) { - sThreadIndex = gThreadCounter++; - } - return sThreadIndex; -#endif //BT_THREADSAFE -} - -void btEnterProfileZoneDefault(const char* name) -{ -} -void btLeaveProfileZoneDefault() -{ -} - + static int gThreadCounter = 0; -#else -void btEnterProfileZoneDefault(const char* name) -{ -} -void btLeaveProfileZoneDefault() -{ + if (sThreadIndex == kNullIndex) + { + sThreadIndex = gThreadCounter++; + } + return sThreadIndex; +#endif //BT_THREADSAFE } -#endif //BT_NO_PROFILE - - - - static btEnterProfileZoneFunc* bts_enterFunc = btEnterProfileZoneDefault; static btLeaveProfileZoneFunc* bts_leaveFunc = btLeaveProfileZoneDefault; @@ -762,14 +775,13 @@ void btLeaveProfileZone() btEnterProfileZoneFunc* btGetCurrentEnterProfileZoneFunc() { - return bts_enterFunc ; + return bts_enterFunc; } btLeaveProfileZoneFunc* btGetCurrentLeaveProfileZoneFunc() { return bts_leaveFunc; } - void btSetCustomEnterProfileZoneFunc(btEnterProfileZoneFunc* enterFunc) { bts_enterFunc = enterFunc; @@ -779,13 +791,12 @@ void btSetCustomLeaveProfileZoneFunc(btLeaveProfileZoneFunc* leaveFunc) bts_leaveFunc = leaveFunc; } -CProfileSample::CProfileSample( const char * name ) -{ +CProfileSample::CProfileSample(const char* name) +{ btEnterProfileZone(name); } -CProfileSample::~CProfileSample( void ) -{ +CProfileSample::~CProfileSample(void) +{ btLeaveProfileZone(); } - diff --git a/thirdparty/bullet/LinearMath/btQuickprof.h b/thirdparty/bullet/LinearMath/btQuickprof.h index 98a2675771..990d401d50 100644 --- a/thirdparty/bullet/LinearMath/btQuickprof.h +++ b/thirdparty/bullet/LinearMath/btQuickprof.h @@ -7,11 +7,9 @@ ** ***************************************************************************************************/ -// Credits: The Clock class was inspired by the Timer classes in +// Credits: The 
Clock class was inspired by the Timer classes in // Ogre (www.ogre3d.org). - - #ifndef BT_QUICK_PROF_H #define BT_QUICK_PROF_H @@ -34,97 +32,88 @@ public: /// Resets the initial reference time. void reset(); - /// Returns the time in ms since the last call to reset or since + /// Returns the time in ms since the last call to reset or since /// the btClock was created. unsigned long long int getTimeMilliseconds(); - /// Returns the time in us since the last call to reset or since + /// Returns the time in us since the last call to reset or since /// the Clock was created. unsigned long long int getTimeMicroseconds(); - + unsigned long long int getTimeNanoseconds(); - /// Returns the time in s since the last call to reset or since + /// Returns the time in s since the last call to reset or since /// the Clock was created. btScalar getTimeSeconds(); - + private: struct btClockData* m_data; }; -#endif //USE_BT_CLOCK +#endif //USE_BT_CLOCK -typedef void (btEnterProfileZoneFunc)(const char* msg); -typedef void (btLeaveProfileZoneFunc)(); +typedef void(btEnterProfileZoneFunc)(const char* msg); +typedef void(btLeaveProfileZoneFunc)(); btEnterProfileZoneFunc* btGetCurrentEnterProfileZoneFunc(); btLeaveProfileZoneFunc* btGetCurrentLeaveProfileZoneFunc(); - - void btSetCustomEnterProfileZoneFunc(btEnterProfileZoneFunc* enterFunc); void btSetCustomLeaveProfileZoneFunc(btLeaveProfileZoneFunc* leaveFunc); -#ifndef BT_NO_PROFILE // FIX redefinition -//To disable built-in profiling, please comment out next line -//#define BT_NO_PROFILE 1 -#endif //BT_NO_PROFILE +#ifndef BT_ENABLE_PROFILE +#define BT_NO_PROFILE 1 +#endif //BT_NO_PROFILE const unsigned int BT_QUICKPROF_MAX_THREAD_COUNT = 64; -#ifndef BT_NO_PROFILE -//btQuickprofGetCurrentThreadIndex will return -1 if thread index cannot be determined, +//btQuickprofGetCurrentThreadIndex will return -1 if thread index cannot be determined, //otherwise returns thread index in range [0..maxThreads] unsigned int btQuickprofGetCurrentThreadIndex2(); -#include //@todo remove this, backwards compatibility - -#include "btAlignedAllocator.h" -#include - - - - - +#ifndef BT_NO_PROFILE +#include //@todo remove this, backwards compatibility +#include "btAlignedAllocator.h" +#include ///A node in the Profile Hierarchy Tree -class CProfileNode { - +class CProfileNode +{ public: - CProfileNode( const char * name, CProfileNode * parent ); - ~CProfileNode( void ); + CProfileNode(const char* name, CProfileNode* parent); + ~CProfileNode(void); - CProfileNode * Get_Sub_Node( const char * name ); + CProfileNode* Get_Sub_Node(const char* name); - CProfileNode * Get_Parent( void ) { return Parent; } - CProfileNode * Get_Sibling( void ) { return Sibling; } - CProfileNode * Get_Child( void ) { return Child; } + CProfileNode* Get_Parent(void) { return Parent; } + CProfileNode* Get_Sibling(void) { return Sibling; } + CProfileNode* Get_Child(void) { return Child; } - void CleanupMemory(); - void Reset( void ); - void Call( void ); - bool Return( void ); + void CleanupMemory(); + void Reset(void); + void Call(void); + bool Return(void); - const char * Get_Name( void ) { return Name; } - int Get_Total_Calls( void ) { return TotalCalls; } - float Get_Total_Time( void ) { return TotalTime; } - void* GetUserPointer() const {return m_userPtr;} - void SetUserPointer(void* ptr) { m_userPtr = ptr;} -protected: + const char* Get_Name(void) { return Name; } + int Get_Total_Calls(void) { return TotalCalls; } + float Get_Total_Time(void) { return TotalTime; } + void* GetUserPointer() const { 
return m_userPtr; } + void SetUserPointer(void* ptr) { m_userPtr = ptr; } - const char * Name; - int TotalCalls; - float TotalTime; - unsigned long int StartTime; - int RecursionCounter; - - CProfileNode * Parent; - CProfileNode * Child; - CProfileNode * Sibling; - void* m_userPtr; +protected: + const char* Name; + int TotalCalls; + float TotalTime; + unsigned long int StartTime; + int RecursionCounter; + + CProfileNode* Parent; + CProfileNode* Child; + CProfileNode* Sibling; + void* m_userPtr; }; ///An iterator to navigate through the tree @@ -132,91 +121,80 @@ class CProfileIterator { public: // Access all the children of the current parent - void First(void); - void Next(void); - bool Is_Done(void); - bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); } + void First(void); + void Next(void); + bool Is_Done(void); + bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); } - void Enter_Child( int index ); // Make the given child the new parent - void Enter_Largest_Child( void ); // Make the largest child the new parent - void Enter_Parent( void ); // Make the current parent's parent the new parent + void Enter_Child(int index); // Make the given child the new parent + void Enter_Largest_Child(void); // Make the largest child the new parent + void Enter_Parent(void); // Make the current parent's parent the new parent // Access the current child - const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); } - int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); } - float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); } + const char* Get_Current_Name(void) { return CurrentChild->Get_Name(); } + int Get_Current_Total_Calls(void) { return CurrentChild->Get_Total_Calls(); } + float Get_Current_Total_Time(void) { return CurrentChild->Get_Total_Time(); } - void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); } - void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);} + void* Get_Current_UserPointer(void) { return CurrentChild->GetUserPointer(); } + void Set_Current_UserPointer(void* ptr) { CurrentChild->SetUserPointer(ptr); } // Access the current parent - const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); } - int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); } - float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); } - - + const char* Get_Current_Parent_Name(void) { return CurrentParent->Get_Name(); } + int Get_Current_Parent_Total_Calls(void) { return CurrentParent->Get_Total_Calls(); } + float Get_Current_Parent_Total_Time(void) { return CurrentParent->Get_Total_Time(); } protected: + CProfileNode* CurrentParent; + CProfileNode* CurrentChild; - CProfileNode * CurrentParent; - CProfileNode * CurrentChild; - - - CProfileIterator( CProfileNode * start ); - friend class CProfileManager; + CProfileIterator(CProfileNode* start); + friend class CProfileManager; }; - ///The Manager for the Profile system -class CProfileManager { +class CProfileManager +{ public: - static void Start_Profile( const char * name ); - static void Stop_Profile( void ); + static void Start_Profile(const char* name); + static void Stop_Profile(void); - static void CleanupMemory(void); -// { -// Root.CleanupMemory(); -// } + static void CleanupMemory(void); + // { + // Root.CleanupMemory(); + // } - static void Reset( void ); - static void Increment_Frame_Counter( void ); - static int 
Get_Frame_Count_Since_Reset( void ) { return FrameCounter; } - static float Get_Time_Since_Reset( void ); + static void Reset(void); + static void Increment_Frame_Counter(void); + static int Get_Frame_Count_Since_Reset(void) { return FrameCounter; } + static float Get_Time_Since_Reset(void); - static CProfileIterator * Get_Iterator( void ); -// { -// -// return new CProfileIterator( &Root ); -// } - static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); } + static CProfileIterator* Get_Iterator(void); + // { + // + // return new CProfileIterator( &Root ); + // } + static void Release_Iterator(CProfileIterator* iterator) { delete (iterator); } - static void dumpRecursive(CProfileIterator* profileIterator, int spacing); + static void dumpRecursive(CProfileIterator* profileIterator, int spacing); - static void dumpAll(); + static void dumpAll(); private: - - static int FrameCounter; - static unsigned long int ResetTime; + static int FrameCounter; + static unsigned long int ResetTime; }; - - - -#endif //#ifndef BT_NO_PROFILE +#endif //#ifndef BT_NO_PROFILE ///ProfileSampleClass is a simple way to profile a function's scope ///Use the BT_PROFILE macro at the start of scope to time -class CProfileSample { +class CProfileSample +{ public: - CProfileSample( const char * name ); + CProfileSample(const char* name); - ~CProfileSample( void ); + ~CProfileSample(void); }; -#define BT_PROFILE( name ) CProfileSample __profile( name ) - - - -#endif //BT_QUICK_PROF_H - +#define BT_PROFILE(name) CProfileSample __profile(name) +#endif //BT_QUICK_PROF_H diff --git a/thirdparty/bullet/LinearMath/btRandom.h b/thirdparty/bullet/LinearMath/btRandom.h index 4cbfc6bfe9..e659af8605 100644 --- a/thirdparty/bullet/LinearMath/btRandom.h +++ b/thirdparty/bullet/LinearMath/btRandom.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - - #ifndef BT_GEN_RANDOM_H #define BT_GEN_RANDOM_H @@ -24,8 +22,8 @@ subject to the following restrictions: #define GEN_RAND_MAX UINT_MAX -SIMD_FORCE_INLINE void GEN_srand(unsigned int seed) { init_genrand(seed); } -SIMD_FORCE_INLINE unsigned int GEN_rand() { return genrand_int32(); } +SIMD_FORCE_INLINE void GEN_srand(unsigned int seed) { init_genrand(seed); } +SIMD_FORCE_INLINE unsigned int GEN_rand() { return genrand_int32(); } #else @@ -33,10 +31,9 @@ SIMD_FORCE_INLINE unsigned int GEN_rand() { return genrand_int #define GEN_RAND_MAX RAND_MAX -SIMD_FORCE_INLINE void GEN_srand(unsigned int seed) { srand(seed); } -SIMD_FORCE_INLINE unsigned int GEN_rand() { return rand(); } +SIMD_FORCE_INLINE void GEN_srand(unsigned int seed) { srand(seed); } +SIMD_FORCE_INLINE unsigned int GEN_rand() { return rand(); } #endif -#endif //BT_GEN_RANDOM_H - +#endif //BT_GEN_RANDOM_H diff --git a/thirdparty/bullet/LinearMath/btScalar.h b/thirdparty/bullet/LinearMath/btScalar.h index 24e8454c1f..c198bd4b35 100644 --- a/thirdparty/bullet/LinearMath/btScalar.h +++ b/thirdparty/bullet/LinearMath/btScalar.h @@ -32,7 +32,6 @@ inline int btGetVersion() return BT_BULLET_VERSION; } - // The following macro "BT_NOT_EMPTY_FILE" can be put into a file // in order suppress the MS Visual C++ Linker warning 4221 // @@ -44,16 +43,19 @@ inline int btGetVersion() // // see more https://stackoverflow.com/questions/1822887/what-is-the-best-way-to-eliminate-ms-visual-c-linker-warning-warning-lnk422 -#if defined (_MSC_VER) - #define BT_NOT_EMPTY_FILE_CAT_II(p, res) res - #define BT_NOT_EMPTY_FILE_CAT_I(a, b) BT_NOT_EMPTY_FILE_CAT_II(~, a ## b) - #define BT_NOT_EMPTY_FILE_CAT(a, b) BT_NOT_EMPTY_FILE_CAT_I(a, b) - #define BT_NOT_EMPTY_FILE namespace { char BT_NOT_EMPTY_FILE_CAT(NoEmptyFileDummy, __COUNTER__); } +#if defined(_MSC_VER) +#define BT_NOT_EMPTY_FILE_CAT_II(p, res) res +#define BT_NOT_EMPTY_FILE_CAT_I(a, b) BT_NOT_EMPTY_FILE_CAT_II(~, a##b) +#define BT_NOT_EMPTY_FILE_CAT(a, b) BT_NOT_EMPTY_FILE_CAT_I(a, b) +#define BT_NOT_EMPTY_FILE \ + namespace \ + { \ + char BT_NOT_EMPTY_FILE_CAT(NoEmptyFileDummy, __COUNTER__); \ + } #else - #define BT_NOT_EMPTY_FILE +#define BT_NOT_EMPTY_FILE #endif - // clang and most formatting tools don't support indentation of preprocessor guards, so turn it off // clang-format off #if defined(DEBUG) || defined (_DEBUG) diff --git a/thirdparty/bullet/LinearMath/btSerializer.cpp b/thirdparty/bullet/LinearMath/btSerializer.cpp index 4faa8f536b..18683c8fa7 100644 --- a/thirdparty/bullet/LinearMath/btSerializer.cpp +++ b/thirdparty/bullet/LinearMath/btSerializer.cpp @@ -1,3 +1,4 @@ +// clang-format off char sBulletDNAstr[]= { char(83),char(68),char(78),char(65),char(78),char(65),char(77),char(69),char(-76),char(1),char(0),char(0),char(109),char(95),char(115),char(105),char(122),char(101),char(0),char(109), char(95),char(99),char(97),char(112),char(97),char(99),char(105),char(116),char(121),char(0),char(42),char(109),char(95),char(100),char(97),char(116),char(97),char(0),char(109),char(95), @@ -687,3 +688,5 @@ char(97),char(0),char(4),char(0),char(50),char(0),char(-79),char(1),char(96),cha char(98),char(0),char(4),char(0),char(48),char(0),char(-79),char(1),char(95),char(0),char(-78),char(1),char(4),char(0),char(-77),char(1),char(0),char(0),char(37),char(0), }; int sBulletDNAlen= sizeof(sBulletDNAstr); + +// clang-format on diff --git a/thirdparty/bullet/LinearMath/btSerializer.h b/thirdparty/bullet/LinearMath/btSerializer.h index 39be3f810e..ba34441615 100644 --- 
a/thirdparty/bullet/LinearMath/btSerializer.h +++ b/thirdparty/bullet/LinearMath/btSerializer.h @@ -16,49 +16,45 @@ subject to the following restrictions: #ifndef BT_SERIALIZER_H #define BT_SERIALIZER_H -#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE +#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE #include "btHashMap.h" -#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__) +#if !defined(__CELLOS_LV2__) && !defined(__MWERKS__) #include #endif #include - - - extern char sBulletDNAstr[]; extern int sBulletDNAlen; extern char sBulletDNAstr64[]; extern int sBulletDNAlen64; -SIMD_FORCE_INLINE int btStrLen(const char* str) +SIMD_FORCE_INLINE int btStrLen(const char* str) { - if (!str) - return(0); + if (!str) + return (0); int len = 0; while (*str != 0) { - str++; - len++; - } + str++; + len++; + } - return len; + return len; } - class btChunk { public: - int m_chunkCode; - int m_length; - void *m_oldPtr; - int m_dna_nr; - int m_number; + int m_chunkCode; + int m_length; + void* m_oldPtr; + int m_dna_nr; + int m_number; }; -enum btSerializationFlags +enum btSerializationFlags { BT_SERIALIZE_NO_BVH = 1, BT_SERIALIZE_NO_TRIANGLEINFOMAP = 2, @@ -66,78 +62,71 @@ enum btSerializationFlags BT_SERIALIZE_CONTACT_MANIFOLDS = 8, }; -class btSerializer +class btSerializer { - public: - virtual ~btSerializer() {} - virtual const unsigned char* getBufferPointer() const = 0; + virtual const unsigned char* getBufferPointer() const = 0; - virtual int getCurrentBufferSize() const = 0; + virtual int getCurrentBufferSize() const = 0; - virtual btChunk* allocate(size_t size, int numElements) = 0; + virtual btChunk* allocate(size_t size, int numElements) = 0; - virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode,void* oldPtr)= 0; + virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode, void* oldPtr) = 0; - virtual void* findPointer(void* oldPtr) = 0; + virtual void* findPointer(void* oldPtr) = 0; - virtual void* getUniquePointer(void*oldPtr) = 0; + virtual void* getUniquePointer(void* oldPtr) = 0; - virtual void startSerialization() = 0; + virtual void startSerialization() = 0; - virtual void finishSerialization() = 0; + virtual void finishSerialization() = 0; - virtual const char* findNameForPointer(const void* ptr) const = 0; + virtual const char* findNameForPointer(const void* ptr) const = 0; - virtual void registerNameForPointer(const void* ptr, const char* name) = 0; + virtual void registerNameForPointer(const void* ptr, const char* name) = 0; - virtual void serializeName(const char* ptr) = 0; + virtual void serializeName(const char* ptr) = 0; - virtual int getSerializationFlags() const = 0; + virtual int getSerializationFlags() const = 0; - virtual void setSerializationFlags(int flags) = 0; + virtual void setSerializationFlags(int flags) = 0; virtual int getNumChunks() const = 0; virtual const btChunk* getChunk(int chunkIndex) const = 0; - }; - - #define BT_HEADER_LENGTH 12 -#if defined(__sgi) || defined (__sparc) || defined (__sparc__) || defined (__PPC__) || defined (__ppc__) || defined (__BIG_ENDIAN__) -# define BT_MAKE_ID(a,b,c,d) ( (int)(a)<<24 | (int)(b)<<16 | (c)<<8 | (d) ) +#if defined(__sgi) || defined(__sparc) || defined(__sparc__) || defined(__PPC__) || defined(__ppc__) || defined(__BIG_ENDIAN__) +#define BT_MAKE_ID(a, b, c, d) ((int)(a) << 24 | (int)(b) << 16 | (c) << 8 | (d)) #else -# define BT_MAKE_ID(a,b,c,d) ( (int)(d)<<24 | (int)(c)<<16 | (b)<<8 | (a) ) +#define BT_MAKE_ID(a, b, c, d) ((int)(d) << 24 | 
(int)(c) << 16 | (b) << 8 | (a)) #endif - -#define BT_MULTIBODY_CODE BT_MAKE_ID('M','B','D','Y') -#define BT_MB_LINKCOLLIDER_CODE BT_MAKE_ID('M','B','L','C') -#define BT_SOFTBODY_CODE BT_MAKE_ID('S','B','D','Y') -#define BT_COLLISIONOBJECT_CODE BT_MAKE_ID('C','O','B','J') -#define BT_RIGIDBODY_CODE BT_MAKE_ID('R','B','D','Y') -#define BT_CONSTRAINT_CODE BT_MAKE_ID('C','O','N','S') -#define BT_BOXSHAPE_CODE BT_MAKE_ID('B','O','X','S') -#define BT_QUANTIZED_BVH_CODE BT_MAKE_ID('Q','B','V','H') -#define BT_TRIANLGE_INFO_MAP BT_MAKE_ID('T','M','A','P') -#define BT_SHAPE_CODE BT_MAKE_ID('S','H','A','P') -#define BT_ARRAY_CODE BT_MAKE_ID('A','R','A','Y') -#define BT_SBMATERIAL_CODE BT_MAKE_ID('S','B','M','T') -#define BT_SBNODE_CODE BT_MAKE_ID('S','B','N','D') -#define BT_DYNAMICSWORLD_CODE BT_MAKE_ID('D','W','L','D') -#define BT_CONTACTMANIFOLD_CODE BT_MAKE_ID('C','O','N','T') -#define BT_DNA_CODE BT_MAKE_ID('D','N','A','1') - -struct btPointerUid +#define BT_MULTIBODY_CODE BT_MAKE_ID('M', 'B', 'D', 'Y') +#define BT_MB_LINKCOLLIDER_CODE BT_MAKE_ID('M', 'B', 'L', 'C') +#define BT_SOFTBODY_CODE BT_MAKE_ID('S', 'B', 'D', 'Y') +#define BT_COLLISIONOBJECT_CODE BT_MAKE_ID('C', 'O', 'B', 'J') +#define BT_RIGIDBODY_CODE BT_MAKE_ID('R', 'B', 'D', 'Y') +#define BT_CONSTRAINT_CODE BT_MAKE_ID('C', 'O', 'N', 'S') +#define BT_BOXSHAPE_CODE BT_MAKE_ID('B', 'O', 'X', 'S') +#define BT_QUANTIZED_BVH_CODE BT_MAKE_ID('Q', 'B', 'V', 'H') +#define BT_TRIANLGE_INFO_MAP BT_MAKE_ID('T', 'M', 'A', 'P') +#define BT_SHAPE_CODE BT_MAKE_ID('S', 'H', 'A', 'P') +#define BT_ARRAY_CODE BT_MAKE_ID('A', 'R', 'A', 'Y') +#define BT_SBMATERIAL_CODE BT_MAKE_ID('S', 'B', 'M', 'T') +#define BT_SBNODE_CODE BT_MAKE_ID('S', 'B', 'N', 'D') +#define BT_DYNAMICSWORLD_CODE BT_MAKE_ID('D', 'W', 'L', 'D') +#define BT_CONTACTMANIFOLD_CODE BT_MAKE_ID('C', 'O', 'N', 'T') +#define BT_DNA_CODE BT_MAKE_ID('D', 'N', 'A', '1') + +struct btPointerUid { - union - { - void* m_ptr; - int m_uniqueIds[2]; + union { + void* m_ptr; + int m_uniqueIds[2]; }; }; @@ -146,8 +135,8 @@ struct btBulletSerializedArrays btBulletSerializedArrays() { } - btAlignedObjectArray m_bvhsDouble; - btAlignedObjectArray m_bvhsFloat; + btAlignedObjectArray m_bvhsDouble; + btAlignedObjectArray m_bvhsFloat; btAlignedObjectArray m_colShapeData; btAlignedObjectArray m_dynamicWorldInfoDataDouble; btAlignedObjectArray m_dynamicWorldInfoDataFloat; @@ -157,51 +146,42 @@ struct btBulletSerializedArrays btAlignedObjectArray m_collisionObjectDataFloat; btAlignedObjectArray m_constraintDataFloat; btAlignedObjectArray m_constraintDataDouble; - btAlignedObjectArray m_constraintData;//for backwards compatibility + btAlignedObjectArray m_constraintData; //for backwards compatibility btAlignedObjectArray m_softBodyFloatData; btAlignedObjectArray m_softBodyDoubleData; - }; - ///The btDefaultSerializer is the main Bullet serialization class. ///The constructor takes an optional argument for backwards compatibility, it is recommended to leave this empty/zero. 
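// Illustrative usage (a minimal sketch, not from this diff): a caller normally lets the
// world drive startSerialization()/finishSerialization() and then writes the buffer out;
// "world" is an assumed, already populated btDiscreteDynamicsWorld*.
//
//     btDefaultSerializer* serializer = new btDefaultSerializer();
//     world->serialize(serializer);
//     if (FILE* f = fopen("scene.bullet", "wb"))
//     {
//         fwrite(serializer->getBufferPointer(), serializer->getCurrentBufferSize(), 1, f);
//         fclose(f);
//     }
//     delete serializer;
//
// The saved buffer starts with the 12-byte header from writeHeader() and ends with the
// DNA chunk from writeDNA(), which is what lets a reader decode the structs later.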
-class btDefaultSerializer : public btSerializer +class btDefaultSerializer : public btSerializer { - protected: + btAlignedObjectArray mTypes; + btAlignedObjectArray mStructs; + btAlignedObjectArray mTlens; + btHashMap mStructReverse; + btHashMap mTypeLookup; - btAlignedObjectArray mTypes; - btAlignedObjectArray mStructs; - btAlignedObjectArray mTlens; - btHashMap mStructReverse; - btHashMap mTypeLookup; - + btHashMap m_chunkP; + btHashMap m_nameMap; - btHashMap m_chunkP; + btHashMap m_uniquePointers; + int m_uniqueIdGenerator; - btHashMap m_nameMap; + int m_totalSize; + unsigned char* m_buffer; + bool m_ownsBuffer; + int m_currentSize; + void* m_dna; + int m_dnaLength; - btHashMap m_uniquePointers; - int m_uniqueIdGenerator; + int m_serializationFlags; - int m_totalSize; - unsigned char* m_buffer; - bool m_ownsBuffer; - int m_currentSize; - void* m_dna; - int m_dnaLength; - - int m_serializationFlags; - - - btAlignedObjectArray m_chunkPtrs; + btAlignedObjectArray m_chunkPtrs; protected: - - - virtual void* findPointer(void* oldPtr) + virtual void* findPointer(void* oldPtr) { void** ptr = m_chunkP.find(oldPtr); if (ptr && *ptr) @@ -209,48 +189,43 @@ protected: return 0; } + virtual void writeDNA() + { + btChunk* dnaChunk = allocate(m_dnaLength, 1); + memcpy(dnaChunk->m_oldPtr, m_dna, m_dnaLength); + finalizeChunk(dnaChunk, "DNA1", BT_DNA_CODE, m_dna); + } + int getReverseType(const char* type) const + { + btHashString key(type); + const int* valuePtr = mTypeLookup.find(key); + if (valuePtr) + return *valuePtr; + return -1; + } + void initDNA(const char* bdnaOrg, int dnalen) + { + ///was already initialized + if (m_dna) + return; - virtual void writeDNA() - { - btChunk* dnaChunk = allocate(m_dnaLength,1); - memcpy(dnaChunk->m_oldPtr,m_dna,m_dnaLength); - finalizeChunk(dnaChunk,"DNA1",BT_DNA_CODE, m_dna); - } - - int getReverseType(const char *type) const - { - - btHashString key(type); - const int* valuePtr = mTypeLookup.find(key); - if (valuePtr) - return *valuePtr; - - return -1; - } - - void initDNA(const char* bdnaOrg,int dnalen) - { - ///was already initialized - if (m_dna) - return; - - int littleEndian= 1; - littleEndian= ((char*)&littleEndian)[0]; - + int littleEndian = 1; + littleEndian = ((char*)&littleEndian)[0]; - m_dna = btAlignedAlloc(dnalen,16); - memcpy(m_dna,bdnaOrg,dnalen); - m_dnaLength = dnalen; + m_dna = btAlignedAlloc(dnalen, 16); + memcpy(m_dna, bdnaOrg, dnalen); + m_dnaLength = dnalen; - int *intPtr=0; - short *shtPtr=0; - char *cp = 0;int dataLen =0; - intPtr = (int*)m_dna; + int* intPtr = 0; + short* shtPtr = 0; + char* cp = 0; + int dataLen = 0; + intPtr = (int*)m_dna; - /* + /* SDNA (4 bytes) (magic number) NAME (4 bytes) (4 bytes) amount of names (int) @@ -258,81 +233,81 @@ protected: */ - if (strncmp((const char*)m_dna, "SDNA", 4)==0) - { - // skip ++ NAME - intPtr++; intPtr++; - } - - // Parse names - if (!littleEndian) - *intPtr = btSwapEndian(*intPtr); + if (strncmp((const char*)m_dna, "SDNA", 4) == 0) + { + // skip ++ NAME + intPtr++; + intPtr++; + } - dataLen = *intPtr; + // Parse names + if (!littleEndian) + *intPtr = btSwapEndian(*intPtr); - intPtr++; + dataLen = *intPtr; - cp = (char*)intPtr; - int i; - for ( i=0; i amount of types (int) */ - intPtr = (int*)cp; - btAssert(strncmp(cp, "TYPE", 4)==0); intPtr++; + intPtr = (int*)cp; + btAssert(strncmp(cp, "TYPE", 4) == 0); + intPtr++; - if (!littleEndian) - *intPtr = btSwapEndian(*intPtr); + if (!littleEndian) + *intPtr = btSwapEndian(*intPtr); - dataLen = *intPtr; - intPtr++; + dataLen = *intPtr; + 
intPtr++; + cp = (char*)intPtr; + for (i = 0; i < dataLen; i++) + { + mTypes.push_back(cp); + while (*cp) cp++; + cp++; + } - cp = (char*)intPtr; - for (i=0; i (short) the lengths of types */ - // Parse type lens - intPtr = (int*)cp; - btAssert(strncmp(cp, "TLEN", 4)==0); intPtr++; + // Parse type lens + intPtr = (int*)cp; + btAssert(strncmp(cp, "TLEN", 4) == 0); + intPtr++; - dataLen = (int)mTypes.size(); + dataLen = (int)mTypes.size(); - shtPtr = (short*)intPtr; - for (i=0; i amount of structs (int) @@ -343,384 +318,372 @@ protected: */ - intPtr = (int*)shtPtr; - cp = (char*)intPtr; - btAssert(strncmp(cp, "STRC", 4)==0); intPtr++; + intPtr = (int*)shtPtr; + cp = (char*)intPtr; + btAssert(strncmp(cp, "STRC", 4) == 0); + intPtr++; - if (!littleEndian) - *intPtr = btSwapEndian(*intPtr); - dataLen = *intPtr ; - intPtr++; + if (!littleEndian) + *intPtr = btSwapEndian(*intPtr); + dataLen = *intPtr; + intPtr++; + shtPtr = (short*)intPtr; + for (i = 0; i < dataLen; i++) + { + mStructs.push_back(shtPtr); - shtPtr = (short*)intPtr; - for (i=0; i m_skipPointers; - + // build reverse lookups + for (i = 0; i < (int)mStructs.size(); i++) + { + short* strc = mStructs.at(i); + mStructReverse.insert(strc[0], i); + mTypeLookup.insert(btHashString(mTypes[strc[0]]), i); + } + } - btDefaultSerializer(int totalSize=0, unsigned char* buffer=0) - :m_uniqueIdGenerator(0), - m_totalSize(totalSize), - m_currentSize(0), - m_dna(0), - m_dnaLength(0), - m_serializationFlags(0) +public: + btHashMap m_skipPointers; + + btDefaultSerializer(int totalSize = 0, unsigned char* buffer = 0) + : m_uniqueIdGenerator(0), + m_totalSize(totalSize), + m_currentSize(0), + m_dna(0), + m_dnaLength(0), + m_serializationFlags(0) + { + if (buffer == 0) + { + m_buffer = m_totalSize ? (unsigned char*)btAlignedAlloc(totalSize, 16) : 0; + m_ownsBuffer = true; + } + else { - if (buffer==0) - { - m_buffer = m_totalSize?(unsigned char*)btAlignedAlloc(totalSize,16):0; - m_ownsBuffer = true; - } else - { - m_buffer = buffer; - m_ownsBuffer = false; - } - - const bool VOID_IS_8 = ((sizeof(void*)==8)); + m_buffer = buffer; + m_ownsBuffer = false; + } + + const bool VOID_IS_8 = ((sizeof(void*) == 8)); #ifdef BT_INTERNAL_UPDATE_SERIALIZATION_STRUCTURES - if (VOID_IS_8) - { + if (VOID_IS_8) + { #if _WIN64 - initDNA((const char*)sBulletDNAstr64,sBulletDNAlen64); + initDNA((const char*)sBulletDNAstr64, sBulletDNAlen64); #else - btAssert(0); + btAssert(0); #endif - } else - { + } + else + { #ifndef _WIN64 - initDNA((const char*)sBulletDNAstr,sBulletDNAlen); + initDNA((const char*)sBulletDNAstr, sBulletDNAlen); #else - btAssert(0); + btAssert(0); #endif - } - -#else //BT_INTERNAL_UPDATE_SERIALIZATION_STRUCTURES - if (VOID_IS_8) - { - initDNA((const char*)sBulletDNAstr64,sBulletDNAlen64); - } else - { - initDNA((const char*)sBulletDNAstr,sBulletDNAlen); - } -#endif //BT_INTERNAL_UPDATE_SERIALIZATION_STRUCTURES - } - virtual ~btDefaultSerializer() +#else //BT_INTERNAL_UPDATE_SERIALIZATION_STRUCTURES + if (VOID_IS_8) { - if (m_buffer && m_ownsBuffer) - btAlignedFree(m_buffer); - if (m_dna) - btAlignedFree(m_dna); - } - - static int getMemoryDnaSizeInBytes() - { - const bool VOID_IS_8 = ((sizeof(void*) == 8)); - - if (VOID_IS_8) - { - return sBulletDNAlen64; - } - return sBulletDNAlen; + initDNA((const char*)sBulletDNAstr64, sBulletDNAlen64); } - static const char* getMemoryDna() + else { - const bool VOID_IS_8 = ((sizeof(void*) == 8)); - if (VOID_IS_8) - { - return (const char*)sBulletDNAstr64; - } - return (const char*)sBulletDNAstr; + initDNA((const 
char*)sBulletDNAstr, sBulletDNAlen); } +#endif //BT_INTERNAL_UPDATE_SERIALIZATION_STRUCTURES + } - void insertHeader() + virtual ~btDefaultSerializer() + { + if (m_buffer && m_ownsBuffer) + btAlignedFree(m_buffer); + if (m_dna) + btAlignedFree(m_dna); + } + + static int getMemoryDnaSizeInBytes() + { + const bool VOID_IS_8 = ((sizeof(void*) == 8)); + + if (VOID_IS_8) { - writeHeader(m_buffer); - m_currentSize += BT_HEADER_LENGTH; + return sBulletDNAlen64; } - - void writeHeader(unsigned char* buffer) const + return sBulletDNAlen; + } + static const char* getMemoryDna() + { + const bool VOID_IS_8 = ((sizeof(void*) == 8)); + if (VOID_IS_8) { + return (const char*)sBulletDNAstr64; + } + return (const char*)sBulletDNAstr; + } + void insertHeader() + { + writeHeader(m_buffer); + m_currentSize += BT_HEADER_LENGTH; + } -#ifdef BT_USE_DOUBLE_PRECISION - memcpy(buffer, "BULLETd", 7); + void writeHeader(unsigned char* buffer) const + { +#ifdef BT_USE_DOUBLE_PRECISION + memcpy(buffer, "BULLETd", 7); #else - memcpy(buffer, "BULLETf", 7); -#endif //BT_USE_DOUBLE_PRECISION - - int littleEndian= 1; - littleEndian= ((char*)&littleEndian)[0]; + memcpy(buffer, "BULLETf", 7); +#endif //BT_USE_DOUBLE_PRECISION - if (sizeof(void*)==8) - { - buffer[7] = '-'; - } else - { - buffer[7] = '_'; - } - - if (littleEndian) - { - buffer[8]='v'; - } else - { - buffer[8]='V'; - } - - - buffer[9] = '2'; - buffer[10] = '8'; - buffer[11] = '8'; + int littleEndian = 1; + littleEndian = ((char*)&littleEndian)[0]; + if (sizeof(void*) == 8) + { + buffer[7] = '-'; } - - virtual void startSerialization() + else { - m_uniqueIdGenerator= 1; - if (m_totalSize) - { - unsigned char* buffer = internalAlloc(BT_HEADER_LENGTH); - writeHeader(buffer); - } - + buffer[7] = '_'; } - virtual void finishSerialization() + if (littleEndian) { - writeDNA(); - - //if we didn't pre-allocate a buffer, we need to create a contiguous buffer now - int mysize = 0; - if (!m_totalSize) - { - if (m_buffer) - btAlignedFree(m_buffer); - - m_currentSize += BT_HEADER_LENGTH; - m_buffer = (unsigned char*)btAlignedAlloc(m_currentSize,16); + buffer[8] = 'v'; + } + else + { + buffer[8] = 'V'; + } - unsigned char* currentPtr = m_buffer; - writeHeader(m_buffer); - currentPtr += BT_HEADER_LENGTH; - mysize+=BT_HEADER_LENGTH; - for (int i=0;i< m_chunkPtrs.size();i++) - { - int curLength = sizeof(btChunk)+m_chunkPtrs[i]->m_length; - memcpy(currentPtr,m_chunkPtrs[i], curLength); - btAlignedFree(m_chunkPtrs[i]); - currentPtr+=curLength; - mysize+=curLength; - } - } + buffer[9] = '2'; + buffer[10] = '8'; + buffer[11] = '8'; + } - mTypes.clear(); - mStructs.clear(); - mTlens.clear(); - mStructReverse.clear(); - mTypeLookup.clear(); - m_skipPointers.clear(); - m_chunkP.clear(); - m_nameMap.clear(); - m_uniquePointers.clear(); - m_chunkPtrs.clear(); + virtual void startSerialization() + { + m_uniqueIdGenerator = 1; + if (m_totalSize) + { + unsigned char* buffer = internalAlloc(BT_HEADER_LENGTH); + writeHeader(buffer); } + } - virtual void* getUniquePointer(void*oldPtr) + virtual void finishSerialization() + { + writeDNA(); + + //if we didn't pre-allocate a buffer, we need to create a contiguous buffer now + int mysize = 0; + if (!m_totalSize) { - btAssert(m_uniqueIdGenerator >= 0); - if (!oldPtr) - return 0; + if (m_buffer) + btAlignedFree(m_buffer); - btPointerUid* uptr = (btPointerUid*)m_uniquePointers.find(oldPtr); - if (uptr) - { - return uptr->m_ptr; - } + m_currentSize += BT_HEADER_LENGTH; + m_buffer = (unsigned char*)btAlignedAlloc(m_currentSize, 16); - void** ptr2 = 
m_skipPointers[oldPtr]; - if (ptr2) + unsigned char* currentPtr = m_buffer; + writeHeader(m_buffer); + currentPtr += BT_HEADER_LENGTH; + mysize += BT_HEADER_LENGTH; + for (int i = 0; i < m_chunkPtrs.size(); i++) { - return 0; + int curLength = sizeof(btChunk) + m_chunkPtrs[i]->m_length; + memcpy(currentPtr, m_chunkPtrs[i], curLength); + btAlignedFree(m_chunkPtrs[i]); + currentPtr += curLength; + mysize += curLength; } + } - m_uniqueIdGenerator++; - - btPointerUid uid; - uid.m_uniqueIds[0] = m_uniqueIdGenerator; - uid.m_uniqueIds[1] = m_uniqueIdGenerator; - m_uniquePointers.insert(oldPtr,uid); - return uid.m_ptr; + mTypes.clear(); + mStructs.clear(); + mTlens.clear(); + mStructReverse.clear(); + mTypeLookup.clear(); + m_skipPointers.clear(); + m_chunkP.clear(); + m_nameMap.clear(); + m_uniquePointers.clear(); + m_chunkPtrs.clear(); + } - } + virtual void* getUniquePointer(void* oldPtr) + { + btAssert(m_uniqueIdGenerator >= 0); + if (!oldPtr) + return 0; - virtual const unsigned char* getBufferPointer() const + btPointerUid* uptr = (btPointerUid*)m_uniquePointers.find(oldPtr); + if (uptr) { - return m_buffer; + return uptr->m_ptr; } - virtual int getCurrentBufferSize() const + void** ptr2 = m_skipPointers[oldPtr]; + if (ptr2) { - return m_currentSize; + return 0; } - virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode,void* oldPtr) - { - if (!(m_serializationFlags&BT_SERIALIZE_NO_DUPLICATE_ASSERT)) - { - btAssert(!findPointer(oldPtr)); - } + m_uniqueIdGenerator++; - chunk->m_dna_nr = getReverseType(structType); - - chunk->m_chunkCode = chunkCode; + btPointerUid uid; + uid.m_uniqueIds[0] = m_uniqueIdGenerator; + uid.m_uniqueIds[1] = m_uniqueIdGenerator; + m_uniquePointers.insert(oldPtr, uid); + return uid.m_ptr; + } - void* uniquePtr = getUniquePointer(oldPtr); + virtual const unsigned char* getBufferPointer() const + { + return m_buffer; + } - m_chunkP.insert(oldPtr,uniquePtr);//chunk->m_oldPtr); - chunk->m_oldPtr = uniquePtr;//oldPtr; + virtual int getCurrentBufferSize() const + { + return m_currentSize; + } + virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode, void* oldPtr) + { + if (!(m_serializationFlags & BT_SERIALIZE_NO_DUPLICATE_ASSERT)) + { + btAssert(!findPointer(oldPtr)); } + chunk->m_dna_nr = getReverseType(structType); - virtual unsigned char* internalAlloc(size_t size) - { - unsigned char* ptr = 0; + chunk->m_chunkCode = chunkCode; - if (m_totalSize) - { - ptr = m_buffer+m_currentSize; - m_currentSize += int(size); - btAssert(m_currentSizem_oldPtr); + chunk->m_oldPtr = uniquePtr; //oldPtr; + } + virtual unsigned char* internalAlloc(size_t size) + { + unsigned char* ptr = 0; - virtual btChunk* allocate(size_t size, int numElements) + if (m_totalSize) { + ptr = m_buffer + m_currentSize; + m_currentSize += int(size); + btAssert(m_currentSize < m_totalSize); + } + else + { + ptr = (unsigned char*)btAlignedAlloc(size, 16); + m_currentSize += int(size); + } + return ptr; + } - unsigned char* ptr = internalAlloc(int(size)*numElements+sizeof(btChunk)); - - unsigned char* data = ptr + sizeof(btChunk); + virtual btChunk* allocate(size_t size, int numElements) + { + unsigned char* ptr = internalAlloc(int(size) * numElements + sizeof(btChunk)); - btChunk* chunk = (btChunk*)ptr; - chunk->m_chunkCode = 0; - chunk->m_oldPtr = data; - chunk->m_length = int(size)*numElements; - chunk->m_number = numElements; + unsigned char* data = ptr + sizeof(btChunk); - m_chunkPtrs.push_back(chunk); + btChunk* chunk = (btChunk*)ptr; + 
chunk->m_chunkCode = 0; + chunk->m_oldPtr = data; + chunk->m_length = int(size) * numElements; + chunk->m_number = numElements; + m_chunkPtrs.push_back(chunk); - return chunk; - } + return chunk; + } - virtual const char* findNameForPointer(const void* ptr) const - { - const char*const * namePtr = m_nameMap.find(ptr); - if (namePtr && *namePtr) - return *namePtr; - return 0; + virtual const char* findNameForPointer(const void* ptr) const + { + const char* const* namePtr = m_nameMap.find(ptr); + if (namePtr && *namePtr) + return *namePtr; + return 0; + } - } + virtual void registerNameForPointer(const void* ptr, const char* name) + { + m_nameMap.insert(ptr, name); + } - virtual void registerNameForPointer(const void* ptr, const char* name) + virtual void serializeName(const char* name) + { + if (name) { - m_nameMap.insert(ptr,name); - } + //don't serialize name twice + if (findPointer((void*)name)) + return; - virtual void serializeName(const char* name) - { - if (name) + int len = btStrLen(name); + if (len) { - //don't serialize name twice - if (findPointer((void*)name)) - return; - - int len = btStrLen(name); - if (len) + int newLen = len + 1; + int padding = ((newLen + 3) & ~3) - newLen; + newLen += padding; + + //serialize name string now + btChunk* chunk = allocate(sizeof(char), newLen); + char* destinationName = (char*)chunk->m_oldPtr; + for (int i = 0; i < len; i++) { - - int newLen = len+1; - int padding = ((newLen+3)&~3)-newLen; - newLen += padding; - - //serialize name string now - btChunk* chunk = allocate(sizeof(char),newLen); - char* destinationName = (char*)chunk->m_oldPtr; - for (int i=0;i m_uid2ChunkPtr; - btHashMap m_orgPtr2UniqueDataPtr; - btHashMap m_names2Ptr; - - - btBulletSerializedArrays m_arrays; - - btInMemorySerializer(int totalSize=0, unsigned char* buffer=0) - :btDefaultSerializer(totalSize,buffer) - { - - } - - virtual void startSerialization() - { - m_uid2ChunkPtr.clear(); - //todo: m_arrays.clear(); - btDefaultSerializer::startSerialization(); - } - - - - btChunk* findChunkFromUniquePointer(void* uniquePointer) - { - btChunk** chkPtr = m_uid2ChunkPtr[uniquePointer]; - if (chkPtr) - { - return *chkPtr; - } - return 0; - } - - virtual void registerNameForPointer(const void* ptr, const char* name) - { - btDefaultSerializer::registerNameForPointer(ptr,name); - m_names2Ptr.insert(name,ptr); - } - - virtual void finishSerialization() - { - } - - virtual void* getUniquePointer(void*oldPtr) - { - if (oldPtr==0) - return 0; - - // void* uniquePtr = getUniquePointer(oldPtr); - btChunk* chunk = findChunkFromUniquePointer(oldPtr); - if (chunk) - { - return chunk->m_oldPtr; - } else - { - const char* n = (const char*) oldPtr; - const void** ptr = m_names2Ptr[n]; - if (ptr) - { - return oldPtr; - } else - { - void** ptr2 = m_skipPointers[oldPtr]; - if (ptr2) - { - return 0; - } else - { - //If this assert hit, serialization happened in the wrong order - // 'getUniquePointer' - btAssert(0); - } - - } - return 0; - } - return oldPtr; - } + btHashMap m_uid2ChunkPtr; + btHashMap m_orgPtr2UniqueDataPtr; + btHashMap m_names2Ptr; + + btBulletSerializedArrays m_arrays; + + btInMemorySerializer(int totalSize = 0, unsigned char* buffer = 0) + : btDefaultSerializer(totalSize, buffer) + { + } + + virtual void startSerialization() + { + m_uid2ChunkPtr.clear(); + //todo: m_arrays.clear(); + btDefaultSerializer::startSerialization(); + } - virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode,void* oldPtr) - { - if 
(!(m_serializationFlags&BT_SERIALIZE_NO_DUPLICATE_ASSERT)) - { - btAssert(!findPointer(oldPtr)); - } + btChunk* findChunkFromUniquePointer(void* uniquePointer) + { + btChunk** chkPtr = m_uid2ChunkPtr[uniquePointer]; + if (chkPtr) + { + return *chkPtr; + } + return 0; + } - chunk->m_dna_nr = getReverseType(structType); - chunk->m_chunkCode = chunkCode; - //void* uniquePtr = getUniquePointer(oldPtr); - m_chunkP.insert(oldPtr,oldPtr);//chunk->m_oldPtr); - // chunk->m_oldPtr = uniquePtr;//oldPtr; + virtual void registerNameForPointer(const void* ptr, const char* name) + { + btDefaultSerializer::registerNameForPointer(ptr, name); + m_names2Ptr.insert(name, ptr); + } - void* uid = findPointer(oldPtr); - m_uid2ChunkPtr.insert(uid,chunk); + virtual void finishSerialization() + { + } - switch (chunk->m_chunkCode) + virtual void* getUniquePointer(void* oldPtr) + { + if (oldPtr == 0) + return 0; + + // void* uniquePtr = getUniquePointer(oldPtr); + btChunk* chunk = findChunkFromUniquePointer(oldPtr); + if (chunk) + { + return chunk->m_oldPtr; + } + else + { + const char* n = (const char*)oldPtr; + const void** ptr = m_names2Ptr[n]; + if (ptr) { - case BT_SOFTBODY_CODE: + return oldPtr; + } + else { - #ifdef BT_USE_DOUBLE_PRECISION - m_arrays.m_softBodyDoubleData.push_back((btSoftBodyDoubleData*) chunk->m_oldPtr); - #else - m_arrays.m_softBodyFloatData.push_back((btSoftBodyFloatData*) chunk->m_oldPtr); - #endif - break; + void** ptr2 = m_skipPointers[oldPtr]; + if (ptr2) + { + return 0; } - case BT_COLLISIONOBJECT_CODE: + else { - #ifdef BT_USE_DOUBLE_PRECISION - m_arrays.m_collisionObjectDataDouble.push_back((btCollisionObjectDoubleData*)chunk->m_oldPtr); - #else//BT_USE_DOUBLE_PRECISION - m_arrays.m_collisionObjectDataFloat.push_back((btCollisionObjectFloatData*)chunk->m_oldPtr); - #endif //BT_USE_DOUBLE_PRECISION - break; + //If this assert hit, serialization happened in the wrong order + // 'getUniquePointer' + btAssert(0); } + } + return 0; + } + return oldPtr; + } + + virtual void finalizeChunk(btChunk* chunk, const char* structType, int chunkCode, void* oldPtr) + { + if (!(m_serializationFlags & BT_SERIALIZE_NO_DUPLICATE_ASSERT)) + { + btAssert(!findPointer(oldPtr)); + } + + chunk->m_dna_nr = getReverseType(structType); + chunk->m_chunkCode = chunkCode; + //void* uniquePtr = getUniquePointer(oldPtr); + m_chunkP.insert(oldPtr, oldPtr); //chunk->m_oldPtr); + // chunk->m_oldPtr = uniquePtr;//oldPtr; + + void* uid = findPointer(oldPtr); + m_uid2ChunkPtr.insert(uid, chunk); + + switch (chunk->m_chunkCode) + { + case BT_SOFTBODY_CODE: + { +#ifdef BT_USE_DOUBLE_PRECISION + m_arrays.m_softBodyDoubleData.push_back((btSoftBodyDoubleData*)chunk->m_oldPtr); +#else + m_arrays.m_softBodyFloatData.push_back((btSoftBodyFloatData*)chunk->m_oldPtr); +#endif + break; + } + case BT_COLLISIONOBJECT_CODE: + { +#ifdef BT_USE_DOUBLE_PRECISION + m_arrays.m_collisionObjectDataDouble.push_back((btCollisionObjectDoubleData*)chunk->m_oldPtr); +#else //BT_USE_DOUBLE_PRECISION + m_arrays.m_collisionObjectDataFloat.push_back((btCollisionObjectFloatData*)chunk->m_oldPtr); +#endif //BT_USE_DOUBLE_PRECISION + break; + } case BT_RIGIDBODY_CODE: - { - #ifdef BT_USE_DOUBLE_PRECISION - m_arrays.m_rigidBodyDataDouble.push_back((btRigidBodyDoubleData*)chunk->m_oldPtr); - #else - m_arrays.m_rigidBodyDataFloat.push_back((btRigidBodyFloatData*)chunk->m_oldPtr); - #endif//BT_USE_DOUBLE_PRECISION - break; - }; + { +#ifdef BT_USE_DOUBLE_PRECISION + m_arrays.m_rigidBodyDataDouble.push_back((btRigidBodyDoubleData*)chunk->m_oldPtr); +#else + 
m_arrays.m_rigidBodyDataFloat.push_back((btRigidBodyFloatData*)chunk->m_oldPtr); +#endif //BT_USE_DOUBLE_PRECISION + break; + }; case BT_CONSTRAINT_CODE: - { - #ifdef BT_USE_DOUBLE_PRECISION - m_arrays.m_constraintDataDouble.push_back((btTypedConstraintDoubleData*)chunk->m_oldPtr); - #else - m_arrays.m_constraintDataFloat.push_back((btTypedConstraintFloatData*)chunk->m_oldPtr); - #endif - break; - } + { +#ifdef BT_USE_DOUBLE_PRECISION + m_arrays.m_constraintDataDouble.push_back((btTypedConstraintDoubleData*)chunk->m_oldPtr); +#else + m_arrays.m_constraintDataFloat.push_back((btTypedConstraintFloatData*)chunk->m_oldPtr); +#endif + break; + } case BT_QUANTIZED_BVH_CODE: - { - #ifdef BT_USE_DOUBLE_PRECISION - m_arrays.m_bvhsDouble.push_back((btQuantizedBvhDoubleData*) chunk->m_oldPtr); - #else - m_arrays.m_bvhsFloat.push_back((btQuantizedBvhFloatData*) chunk->m_oldPtr); - #endif - break; - } + { +#ifdef BT_USE_DOUBLE_PRECISION + m_arrays.m_bvhsDouble.push_back((btQuantizedBvhDoubleData*)chunk->m_oldPtr); +#else + m_arrays.m_bvhsFloat.push_back((btQuantizedBvhFloatData*)chunk->m_oldPtr); +#endif + break; + } case BT_SHAPE_CODE: - { - btCollisionShapeData* shapeData = (btCollisionShapeData*) chunk->m_oldPtr; - m_arrays.m_colShapeData.push_back(shapeData); - break; - } + { + btCollisionShapeData* shapeData = (btCollisionShapeData*)chunk->m_oldPtr; + m_arrays.m_colShapeData.push_back(shapeData); + break; + } case BT_TRIANLGE_INFO_MAP: case BT_ARRAY_CODE: case BT_SBMATERIAL_CODE: case BT_SBNODE_CODE: case BT_DYNAMICSWORLD_CODE: case BT_DNA_CODE: - { - break; - } + { + break; + } default: - { - } - }; - } - - int getNumChunks() const - { - return m_uid2ChunkPtr.size(); - } + { + } + }; + } - const btChunk* getChunk(int chunkIndex) const - { - return *m_uid2ChunkPtr.getAtIndex(chunkIndex); - } + int getNumChunks() const + { + return m_uid2ChunkPtr.size(); + } + const btChunk* getChunk(int chunkIndex) const + { + return *m_uid2ChunkPtr.getAtIndex(chunkIndex); + } }; -#endif //ENABLE_INMEMORY_SERIALIZER - -#endif //BT_SERIALIZER_H +#endif //ENABLE_INMEMORY_SERIALIZER +#endif //BT_SERIALIZER_H diff --git a/thirdparty/bullet/LinearMath/btSerializer64.cpp b/thirdparty/bullet/LinearMath/btSerializer64.cpp index 0aa5cbf30e..cf281cdb36 100644 --- a/thirdparty/bullet/LinearMath/btSerializer64.cpp +++ b/thirdparty/bullet/LinearMath/btSerializer64.cpp @@ -1,3 +1,4 @@ +// clang-format off char sBulletDNAstr64[]= { char(83),char(68),char(78),char(65),char(78),char(65),char(77),char(69),char(-76),char(1),char(0),char(0),char(109),char(95),char(115),char(105),char(122),char(101),char(0),char(109), char(95),char(99),char(97),char(112),char(97),char(99),char(105),char(116),char(121),char(0),char(42),char(109),char(95),char(100),char(97),char(116),char(97),char(0),char(109),char(95), @@ -687,3 +688,4 @@ char(97),char(0),char(4),char(0),char(50),char(0),char(-79),char(1),char(96),cha char(98),char(0),char(4),char(0),char(48),char(0),char(-79),char(1),char(95),char(0),char(-78),char(1),char(4),char(0),char(-77),char(1),char(0),char(0),char(37),char(0), }; int sBulletDNAlen64= sizeof(sBulletDNAstr64); +// clang-format on diff --git a/thirdparty/bullet/LinearMath/btSpatialAlgebra.h b/thirdparty/bullet/LinearMath/btSpatialAlgebra.h index 8e59658bca..6ad67a1081 100644 --- a/thirdparty/bullet/LinearMath/btSpatialAlgebra.h +++ b/thirdparty/bullet/LinearMath/btSpatialAlgebra.h @@ -12,18 +12,17 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ -///These spatial algebra classes are used for btMultiBody, +///These spatial algebra classes are used for btMultiBody, ///see BulletDynamics/Featherstone #ifndef BT_SPATIAL_ALGEBRA_H #define BT_SPATIAL_ALGEBRA_H - #include "btMatrix3x3.h" struct btSpatialForceVector -{ - btVector3 m_topVec, m_bottomVec; +{ + btVector3 m_topVec, m_bottomVec; // btSpatialForceVector() { setZero(); } btSpatialForceVector(const btVector3 &angular, const btVector3 &linear) : m_topVec(linear), m_bottomVec(angular) {} @@ -32,21 +31,34 @@ struct btSpatialForceVector setValue(ax, ay, az, lx, ly, lz); } // - void setVector(const btVector3 &angular, const btVector3 &linear) { m_topVec = linear; m_bottomVec = angular; } + void setVector(const btVector3 &angular, const btVector3 &linear) + { + m_topVec = linear; + m_bottomVec = angular; + } void setValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz) { - m_bottomVec.setValue(ax, ay, az); m_topVec.setValue(lx, ly, lz); + m_bottomVec.setValue(ax, ay, az); + m_topVec.setValue(lx, ly, lz); } // - void addVector(const btVector3 &angular, const btVector3 &linear) { m_topVec += linear; m_bottomVec += angular; } + void addVector(const btVector3 &angular, const btVector3 &linear) + { + m_topVec += linear; + m_bottomVec += angular; + } void addValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz) { - m_bottomVec[0] += ax; m_bottomVec[1] += ay; m_bottomVec[2] += az; - m_topVec[0] += lx; m_topVec[1] += ly; m_topVec[2] += lz; + m_bottomVec[0] += ax; + m_bottomVec[1] += ay; + m_bottomVec[2] += az; + m_topVec[0] += lx; + m_topVec[1] += ly; + m_topVec[2] += lz; } // - const btVector3 & getLinear() const { return m_topVec; } - const btVector3 & getAngular() const { return m_bottomVec; } + const btVector3 &getLinear() const { return m_topVec; } + const btVector3 &getAngular() const { return m_bottomVec; } // void setLinear(const btVector3 &linear) { m_topVec = linear; } void setAngular(const btVector3 &angular) { m_bottomVec = angular; } @@ -54,14 +66,28 @@ struct btSpatialForceVector void addAngular(const btVector3 &angular) { m_bottomVec += angular; } void addLinear(const btVector3 &linear) { m_topVec += linear; } // - void setZero() { m_topVec.setZero(); m_bottomVec.setZero(); } + void setZero() + { + m_topVec.setZero(); + m_bottomVec.setZero(); + } // - btSpatialForceVector & operator += (const btSpatialForceVector &vec) { m_topVec += vec.m_topVec; m_bottomVec += vec.m_bottomVec; return *this; } - btSpatialForceVector & operator -= (const btSpatialForceVector &vec) { m_topVec -= vec.m_topVec; m_bottomVec -= vec.m_bottomVec; return *this; } - btSpatialForceVector operator - (const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec - vec.m_bottomVec, m_topVec - vec.m_topVec); } - btSpatialForceVector operator + (const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec + vec.m_bottomVec, m_topVec + vec.m_topVec); } - btSpatialForceVector operator - () const { return btSpatialForceVector(-m_bottomVec, -m_topVec); } - btSpatialForceVector operator * (const btScalar &s) const { return btSpatialForceVector(s * m_bottomVec, s * m_topVec); } + btSpatialForceVector &operator+=(const btSpatialForceVector &vec) + { + m_topVec += vec.m_topVec; + m_bottomVec += vec.m_bottomVec; + return *this; + } + btSpatialForceVector &operator-=(const btSpatialForceVector &vec) + { + m_topVec -= vec.m_topVec; + 
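// --- [editor's note, not in the original source] -------------------------
// Worth flagging: despite the (angular, linear) argument order,
// btSpatialForceVector stores the *linear* part in m_topVec and the angular
// part in m_bottomVec, while the btSpatialMotionVector defined further down
// reverses that layout. The asymmetry is what makes motion.dot(force) pair
// like quantities; a small sketch, assuming both types as declared here:
//
//   btSpatialForceVector  f(torque, force);    // (angular, linear)
//   btSpatialMotionVector v(omega, velocity);  // (angular, linear)
//   btScalar power = v.dot(f);                 // = omega.dot(torque) + velocity.dot(force)
// -------------------------------------------------------------------------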
m_bottomVec -= vec.m_bottomVec; + return *this; + } + btSpatialForceVector operator-(const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec - vec.m_bottomVec, m_topVec - vec.m_topVec); } + btSpatialForceVector operator+(const btSpatialForceVector &vec) const { return btSpatialForceVector(m_bottomVec + vec.m_bottomVec, m_topVec + vec.m_topVec); } + btSpatialForceVector operator-() const { return btSpatialForceVector(-m_bottomVec, -m_topVec); } + btSpatialForceVector operator*(const btScalar &s) const { return btSpatialForceVector(s * m_bottomVec, s * m_topVec); } //btSpatialForceVector & operator = (const btSpatialForceVector &vec) { m_topVec = vec.m_topVec; m_bottomVec = vec.m_bottomVec; return *this; } }; @@ -70,23 +96,36 @@ struct btSpatialMotionVector btVector3 m_topVec, m_bottomVec; // btSpatialMotionVector() { setZero(); } - btSpatialMotionVector(const btVector3 &angular, const btVector3 &linear) : m_topVec(angular), m_bottomVec(linear) {} + btSpatialMotionVector(const btVector3 &angular, const btVector3 &linear) : m_topVec(angular), m_bottomVec(linear) {} // - void setVector(const btVector3 &angular, const btVector3 &linear) { m_topVec = angular; m_bottomVec = linear; } + void setVector(const btVector3 &angular, const btVector3 &linear) + { + m_topVec = angular; + m_bottomVec = linear; + } void setValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz) { - m_topVec.setValue(ax, ay, az); m_bottomVec.setValue(lx, ly, lz); + m_topVec.setValue(ax, ay, az); + m_bottomVec.setValue(lx, ly, lz); } // - void addVector(const btVector3 &angular, const btVector3 &linear) { m_topVec += linear; m_bottomVec += angular; } + void addVector(const btVector3 &angular, const btVector3 &linear) + { + m_topVec += linear; + m_bottomVec += angular; + } void addValue(const btScalar &ax, const btScalar &ay, const btScalar &az, const btScalar &lx, const btScalar &ly, const btScalar &lz) { - m_topVec[0] += ax; m_topVec[1] += ay; m_topVec[2] += az; - m_bottomVec[0] += lx; m_bottomVec[1] += ly; m_bottomVec[2] += lz; + m_topVec[0] += ax; + m_topVec[1] += ay; + m_topVec[2] += az; + m_bottomVec[0] += lx; + m_bottomVec[1] += ly; + m_bottomVec[2] += lz; } - // - const btVector3 & getAngular() const { return m_topVec; } - const btVector3 & getLinear() const { return m_bottomVec; } + // + const btVector3 &getAngular() const { return m_topVec; } + const btVector3 &getLinear() const { return m_bottomVec; } // void setAngular(const btVector3 &angular) { m_topVec = angular; } void setLinear(const btVector3 &linear) { m_bottomVec = linear; } @@ -94,20 +133,24 @@ struct btSpatialMotionVector void addAngular(const btVector3 &angular) { m_topVec += angular; } void addLinear(const btVector3 &linear) { m_bottomVec += linear; } // - void setZero() { m_topVec.setZero(); m_bottomVec.setZero(); } + void setZero() + { + m_topVec.setZero(); + m_bottomVec.setZero(); + } // btScalar dot(const btSpatialForceVector &b) const { return m_bottomVec.dot(b.m_topVec) + m_topVec.dot(b.m_bottomVec); } // - template + template void cross(const SpatialVectorType &b, SpatialVectorType &out) const { out.m_topVec = m_topVec.cross(b.m_topVec); out.m_bottomVec = m_bottomVec.cross(b.m_topVec) + m_topVec.cross(b.m_bottomVec); } - template + template SpatialVectorType cross(const SpatialVectorType &b) const { SpatialVectorType out; @@ -116,21 +159,36 @@ struct btSpatialMotionVector return out; } // - btSpatialMotionVector & operator += (const btSpatialMotionVector 
&vec) { m_topVec += vec.m_topVec; m_bottomVec += vec.m_bottomVec; return *this; } - btSpatialMotionVector & operator -= (const btSpatialMotionVector &vec) { m_topVec -= vec.m_topVec; m_bottomVec -= vec.m_bottomVec; return *this; } - btSpatialMotionVector & operator *= (const btScalar &s) { m_topVec *= s; m_bottomVec *= s; return *this; } - btSpatialMotionVector operator - (const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec - vec.m_topVec, m_bottomVec - vec.m_bottomVec); } - btSpatialMotionVector operator + (const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec + vec.m_topVec, m_bottomVec + vec.m_bottomVec); } - btSpatialMotionVector operator - () const { return btSpatialMotionVector(-m_topVec, -m_bottomVec); } - btSpatialMotionVector operator * (const btScalar &s) const { return btSpatialMotionVector(s * m_topVec, s * m_bottomVec); } + btSpatialMotionVector &operator+=(const btSpatialMotionVector &vec) + { + m_topVec += vec.m_topVec; + m_bottomVec += vec.m_bottomVec; + return *this; + } + btSpatialMotionVector &operator-=(const btSpatialMotionVector &vec) + { + m_topVec -= vec.m_topVec; + m_bottomVec -= vec.m_bottomVec; + return *this; + } + btSpatialMotionVector &operator*=(const btScalar &s) + { + m_topVec *= s; + m_bottomVec *= s; + return *this; + } + btSpatialMotionVector operator-(const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec - vec.m_topVec, m_bottomVec - vec.m_bottomVec); } + btSpatialMotionVector operator+(const btSpatialMotionVector &vec) const { return btSpatialMotionVector(m_topVec + vec.m_topVec, m_bottomVec + vec.m_bottomVec); } + btSpatialMotionVector operator-() const { return btSpatialMotionVector(-m_topVec, -m_bottomVec); } + btSpatialMotionVector operator*(const btScalar &s) const { return btSpatialMotionVector(s * m_topVec, s * m_bottomVec); } }; struct btSymmetricSpatialDyad { btMatrix3x3 m_topLeftMat, m_topRightMat, m_bottomLeftMat; - // + // btSymmetricSpatialDyad() { setIdentity(); } - btSymmetricSpatialDyad(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat) { setMatrix(topLeftMat, topRightMat, bottomLeftMat); } + btSymmetricSpatialDyad(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat) { setMatrix(topLeftMat, topRightMat, bottomLeftMat); } // void setMatrix(const btMatrix3x3 &topLeftMat, const btMatrix3x3 &topRightMat, const btMatrix3x3 &bottomLeftMat) { @@ -146,17 +204,22 @@ struct btSymmetricSpatialDyad m_bottomLeftMat += bottomLeftMat; } // - void setIdentity() { m_topLeftMat.setIdentity(); m_topRightMat.setIdentity(); m_bottomLeftMat.setIdentity(); } + void setIdentity() + { + m_topLeftMat.setIdentity(); + m_topRightMat.setIdentity(); + m_bottomLeftMat.setIdentity(); + } // - btSymmetricSpatialDyad & operator -= (const btSymmetricSpatialDyad &mat) + btSymmetricSpatialDyad &operator-=(const btSymmetricSpatialDyad &mat) { m_topLeftMat -= mat.m_topLeftMat; m_topRightMat -= mat.m_topRightMat; m_bottomLeftMat -= mat.m_bottomLeftMat; - return *this; + return *this; } // - btSpatialForceVector operator * (const btSpatialMotionVector &vec) + btSpatialForceVector operator*(const btSpatialMotionVector &vec) { return btSpatialForceVector(m_bottomLeftMat * vec.m_topVec + m_topLeftMat.transpose() * vec.m_bottomVec, m_topLeftMat * vec.m_topVec + m_topRightMat * vec.m_bottomVec); } @@ -164,7 +227,7 @@ struct btSymmetricSpatialDyad struct btSpatialTransformationMatrix { - btMatrix3x3 m_rotMat; 
//btMatrix3x3 m_trnCrossMat; + btMatrix3x3 m_rotMat; //btMatrix3x3 m_trnCrossMat; btVector3 m_trnVec; // enum eOutputOperation @@ -174,128 +237,124 @@ struct btSpatialTransformationMatrix Subtract = 2 }; // - template - void transform( const SpatialVectorType &inVec, - SpatialVectorType &outVec, - eOutputOperation outOp = None) + template + void transform(const SpatialVectorType &inVec, + SpatialVectorType &outVec, + eOutputOperation outOp = None) { - if(outOp == None) + if (outOp == None) { outVec.m_topVec = m_rotMat * inVec.m_topVec; outVec.m_bottomVec = -m_trnVec.cross(outVec.m_topVec) + m_rotMat * inVec.m_bottomVec; } - else if(outOp == Add) + else if (outOp == Add) { outVec.m_topVec += m_rotMat * inVec.m_topVec; outVec.m_bottomVec += -m_trnVec.cross(outVec.m_topVec) + m_rotMat * inVec.m_bottomVec; } - else if(outOp == Subtract) + else if (outOp == Subtract) { outVec.m_topVec -= m_rotMat * inVec.m_topVec; outVec.m_bottomVec -= -m_trnVec.cross(outVec.m_topVec) + m_rotMat * inVec.m_bottomVec; } - } - template - void transformRotationOnly( const SpatialVectorType &inVec, - SpatialVectorType &outVec, - eOutputOperation outOp = None) + template + void transformRotationOnly(const SpatialVectorType &inVec, + SpatialVectorType &outVec, + eOutputOperation outOp = None) { - if(outOp == None) + if (outOp == None) { outVec.m_topVec = m_rotMat * inVec.m_topVec; outVec.m_bottomVec = m_rotMat * inVec.m_bottomVec; } - else if(outOp == Add) + else if (outOp == Add) { outVec.m_topVec += m_rotMat * inVec.m_topVec; outVec.m_bottomVec += m_rotMat * inVec.m_bottomVec; } - else if(outOp == Subtract) + else if (outOp == Subtract) { outVec.m_topVec -= m_rotMat * inVec.m_topVec; outVec.m_bottomVec -= m_rotMat * inVec.m_bottomVec; } - } - template - void transformInverse( const SpatialVectorType &inVec, - SpatialVectorType &outVec, - eOutputOperation outOp = None) + template + void transformInverse(const SpatialVectorType &inVec, + SpatialVectorType &outVec, + eOutputOperation outOp = None) { - if(outOp == None) + if (outOp == None) { outVec.m_topVec = m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec = m_rotMat.transpose() * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec)); } - else if(outOp == Add) + else if (outOp == Add) { outVec.m_topVec += m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec += m_rotMat.transpose() * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec)); } - else if(outOp == Subtract) + else if (outOp == Subtract) { outVec.m_topVec -= m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec -= m_rotMat.transpose() * (inVec.m_bottomVec + m_trnVec.cross(inVec.m_topVec)); - } + } } - template - void transformInverseRotationOnly( const SpatialVectorType &inVec, - SpatialVectorType &outVec, - eOutputOperation outOp = None) + template + void transformInverseRotationOnly(const SpatialVectorType &inVec, + SpatialVectorType &outVec, + eOutputOperation outOp = None) { - if(outOp == None) + if (outOp == None) { outVec.m_topVec = m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec = m_rotMat.transpose() * inVec.m_bottomVec; } - else if(outOp == Add) + else if (outOp == Add) { outVec.m_topVec += m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec += m_rotMat.transpose() * inVec.m_bottomVec; } - else if(outOp == Subtract) + else if (outOp == Subtract) { outVec.m_topVec -= m_rotMat.transpose() * inVec.m_topVec; outVec.m_bottomVec -= m_rotMat.transpose() * inVec.m_bottomVec; } - } - void transformInverse( const btSymmetricSpatialDyad &inMat, - btSymmetricSpatialDyad &outMat, - 
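// --- [editor's note] -----------------------------------------------------
// For readers of the transforms above: with R = m_rotMat and r = m_trnVec,
// transform() applies a rigid-body change of frame to a spatial vector,
//   out_top    = R * in_top
//   out_bottom = -r x out_top + R * in_bottom
// and transformInverse() applies the inverse,
//   out_top    = R^T * in_top
//   out_bottom = R^T * (in_bottom + r x in_top)
// The eOutputOperation flag merely selects assign, += or -= on the output.
// -------------------------------------------------------------------------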
eOutputOperation outOp = None) + void transformInverse(const btSymmetricSpatialDyad &inMat, + btSymmetricSpatialDyad &outMat, + eOutputOperation outOp = None) { - const btMatrix3x3 r_cross( 0, -m_trnVec[2], m_trnVec[1], - m_trnVec[2], 0, -m_trnVec[0], - -m_trnVec[1], m_trnVec[0], 0); + const btMatrix3x3 r_cross(0, -m_trnVec[2], m_trnVec[1], + m_trnVec[2], 0, -m_trnVec[0], + -m_trnVec[1], m_trnVec[0], 0); - - if(outOp == None) + if (outOp == None) { - outMat.m_topLeftMat = m_rotMat.transpose() * ( inMat.m_topLeftMat - inMat.m_topRightMat * r_cross ) * m_rotMat; + outMat.m_topLeftMat = m_rotMat.transpose() * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) * m_rotMat; outMat.m_topRightMat = m_rotMat.transpose() * inMat.m_topRightMat * m_rotMat; outMat.m_bottomLeftMat = m_rotMat.transpose() * (r_cross * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) + inMat.m_bottomLeftMat - inMat.m_topLeftMat.transpose() * r_cross) * m_rotMat; } - else if(outOp == Add) + else if (outOp == Add) { - outMat.m_topLeftMat += m_rotMat.transpose() * ( inMat.m_topLeftMat - inMat.m_topRightMat * r_cross ) * m_rotMat; + outMat.m_topLeftMat += m_rotMat.transpose() * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) * m_rotMat; outMat.m_topRightMat += m_rotMat.transpose() * inMat.m_topRightMat * m_rotMat; outMat.m_bottomLeftMat += m_rotMat.transpose() * (r_cross * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) + inMat.m_bottomLeftMat - inMat.m_topLeftMat.transpose() * r_cross) * m_rotMat; } - else if(outOp == Subtract) + else if (outOp == Subtract) { - outMat.m_topLeftMat -= m_rotMat.transpose() * ( inMat.m_topLeftMat - inMat.m_topRightMat * r_cross ) * m_rotMat; + outMat.m_topLeftMat -= m_rotMat.transpose() * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) * m_rotMat; outMat.m_topRightMat -= m_rotMat.transpose() * inMat.m_topRightMat * m_rotMat; outMat.m_bottomLeftMat -= m_rotMat.transpose() * (r_cross * (inMat.m_topLeftMat - inMat.m_topRightMat * r_cross) + inMat.m_bottomLeftMat - inMat.m_topLeftMat.transpose() * r_cross) * m_rotMat; } } - template - SpatialVectorType operator * (const SpatialVectorType &vec) + template + SpatialVectorType operator*(const SpatialVectorType &vec) { SpatialVectorType out; transform(vec, out); @@ -303,7 +362,7 @@ struct btSpatialTransformationMatrix } }; -template +template void symmetricSpatialOuterProduct(const SpatialVectorType &a, const SpatialVectorType &b, btSymmetricSpatialDyad &out) { //output op maybe? @@ -314,7 +373,7 @@ void symmetricSpatialOuterProduct(const SpatialVectorType &a, const SpatialVecto //maybe simple a*spatTranspose(a) would be nicer? } -template +template btSymmetricSpatialDyad symmetricSpatialOuterProduct(const SpatialVectorType &a, const SpatialVectorType &b) { btSymmetricSpatialDyad out; @@ -327,5 +386,4 @@ btSymmetricSpatialDyad symmetricSpatialOuterProduct(const SpatialVectorType &a, //maybe simple a*spatTranspose(a) would be nicer? } -#endif //BT_SPATIAL_ALGEBRA_H - +#endif //BT_SPATIAL_ALGEBRA_H diff --git a/thirdparty/bullet/LinearMath/btStackAlloc.h b/thirdparty/bullet/LinearMath/btStackAlloc.h index 397b084877..3fc2084976 100644 --- a/thirdparty/bullet/LinearMath/btStackAlloc.h +++ b/thirdparty/bullet/LinearMath/btStackAlloc.h @@ -20,97 +20,99 @@ Nov.2006 #ifndef BT_STACK_ALLOC #define BT_STACK_ALLOC -#include "btScalar.h" //for btAssert +#include "btScalar.h" //for btAssert #include "btAlignedAllocator.h" ///The btBlock class is an internal structure for the btStackAlloc memory allocator. 
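///
/// [editor's note, not in the original header] A minimal usage sketch of the
/// stack allocator documented below, assuming its public interface as
/// declared in this file (sizes are arbitrary):
///
///   btStackAlloc stack(1024 * 1024);           // 1 MB backing buffer
///   btBlock* mark = stack.beginBlock();        // push a LIFO marker
///   unsigned char* tmp = stack.allocate(256);  // cheap bump allocation
///   // ... use tmp as scratch memory ...
///   stack.endBlock(mark);                      // releases everything allocated
///                                              // since the matching beginBlock()
///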
struct btBlock { - btBlock* previous; - unsigned char* address; + btBlock* previous; + unsigned char* address; }; ///The StackAlloc class provides some fast stack-based memory allocator (LIFO last-in first-out) class btStackAlloc { public: + btStackAlloc(unsigned int size) + { + ctor(); + create(size); + } + ~btStackAlloc() { destroy(); } - btStackAlloc(unsigned int size) { ctor();create(size); } - ~btStackAlloc() { destroy(); } - - inline void create(unsigned int size) + inline void create(unsigned int size) { destroy(); - data = (unsigned char*) btAlignedAlloc(size,16); - totalsize = size; + data = (unsigned char*)btAlignedAlloc(size, 16); + totalsize = size; } - inline void destroy() + inline void destroy() { - btAssert(usedsize==0); + btAssert(usedsize == 0); //Raise(L"StackAlloc is still in use"); - if(usedsize==0) + if (usedsize == 0) { - if(!ischild && data) + if (!ischild && data) btAlignedFree(data); - data = 0; - usedsize = 0; + data = 0; + usedsize = 0; } - } - int getAvailableMemory() const + int getAvailableMemory() const { return static_cast(totalsize - usedsize); } - unsigned char* allocate(unsigned int size) + unsigned char* allocate(unsigned int size) { - const unsigned int nus(usedsize+size); - if(nusprevious = current; - pb->address = data+usedsize; - current = pb; - return(pb); + btBlock* pb = (btBlock*)allocate(sizeof(btBlock)); + pb->previous = current; + pb->address = data + usedsize; + current = pb; + return (pb); } - SIMD_FORCE_INLINE void endBlock(btBlock* block) + SIMD_FORCE_INLINE void endBlock(btBlock* block) { - btAssert(block==current); + btAssert(block == current); //Raise(L"Unmatched blocks"); - if(block==current) + if (block == current) { - current = block->previous; - usedsize = (unsigned int)((block->address-data)-sizeof(btBlock)); + current = block->previous; + usedsize = (unsigned int)((block->address - data) - sizeof(btBlock)); } } private: - void ctor() + void ctor() { - data = 0; - totalsize = 0; - usedsize = 0; - current = 0; - ischild = false; + data = 0; + totalsize = 0; + usedsize = 0; + current = 0; + ischild = false; } - unsigned char* data; - unsigned int totalsize; - unsigned int usedsize; - btBlock* current; - bool ischild; + unsigned char* data; + unsigned int totalsize; + unsigned int usedsize; + btBlock* current; + bool ischild; }; -#endif //BT_STACK_ALLOC +#endif //BT_STACK_ALLOC diff --git a/thirdparty/bullet/LinearMath/btThreads.cpp b/thirdparty/bullet/LinearMath/btThreads.cpp index c037626ffb..69a86799fa 100644 --- a/thirdparty/bullet/LinearMath/btThreads.cpp +++ b/thirdparty/bullet/LinearMath/btThreads.cpp @@ -12,18 +12,15 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. 
*/ - #include "btThreads.h" #include "btQuickprof.h" #include // for min and max - #if BT_USE_OPENMP && BT_THREADSAFE #include -#endif // #if BT_USE_OPENMP && BT_THREADSAFE - +#endif // #if BT_USE_OPENMP && BT_THREADSAFE #if BT_USE_PPL && BT_THREADSAFE @@ -32,8 +29,7 @@ subject to the following restrictions: // Visual Studio 2010 and later should come with it #include // for GetProcessorCount() -#endif // #if BT_USE_PPL && BT_THREADSAFE - +#endif // #if BT_USE_PPL && BT_THREADSAFE #if BT_USE_TBB && BT_THREADSAFE @@ -44,8 +40,7 @@ subject to the following restrictions: #include #include -#endif // #if BT_USE_TBB && BT_THREADSAFE - +#endif // #if BT_USE_TBB && BT_THREADSAFE #if BT_THREADSAFE // @@ -53,7 +48,7 @@ subject to the following restrictions: // Using ordinary system-provided mutexes like Windows critical sections was noticeably slower // presumably because when it fails to lock at first it would sleep the thread and trigger costly // context switching. -// +// #if __cplusplus >= 201103L @@ -61,25 +56,24 @@ subject to the following restrictions: // on GCC or Clang you need to compile with -std=c++11 #define USE_CPP11_ATOMICS 1 -#elif defined( _MSC_VER ) +#elif defined(_MSC_VER) // on MSVC, use intrinsics instead #define USE_MSVC_INTRINSICS 1 -#elif defined( __GNUC__ ) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) // available since GCC 4.7 and some versions of clang // todo: check for clang #define USE_GCC_BUILTIN_ATOMICS 1 -#elif defined( __GNUC__ ) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) +#elif defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) // available since GCC 4.1 #define USE_GCC_BUILTIN_ATOMICS_OLD 1 #endif - #if USE_CPP11_ATOMICS #include @@ -89,27 +83,26 @@ subject to the following restrictions: bool btSpinMutex::tryLock() { - std::atomic* aDest = reinterpret_cast*>(&mLock); - int expected = 0; - return std::atomic_compare_exchange_weak_explicit( aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire ); + std::atomic* aDest = reinterpret_cast*>(&mLock); + int expected = 0; + return std::atomic_compare_exchange_weak_explicit(aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire); } void btSpinMutex::lock() { - // note: this lock does not sleep the thread. - while (! tryLock()) - { - // spin - } + // note: this lock does not sleep the thread. + while (!tryLock()) + { + // spin + } } void btSpinMutex::unlock() { - std::atomic* aDest = reinterpret_cast*>(&mLock); - std::atomic_store_explicit( aDest, int(0), std::memory_order_release ); + std::atomic* aDest = reinterpret_cast*>(&mLock); + std::atomic_store_explicit(aDest, int(0), std::memory_order_release); } - #elif USE_MSVC_INTRINSICS #define WIN32_LEAN_AND_MEAN @@ -117,148 +110,142 @@ void btSpinMutex::unlock() #include #include -#define THREAD_LOCAL_STATIC __declspec( thread ) static - +#define THREAD_LOCAL_STATIC __declspec(thread) static bool btSpinMutex::tryLock() { - volatile long* aDest = reinterpret_cast(&mLock); - return ( 0 == _InterlockedCompareExchange( aDest, 1, 0) ); + volatile long* aDest = reinterpret_cast(&mLock); + return (0 == _InterlockedCompareExchange(aDest, 1, 0)); } void btSpinMutex::lock() { - // note: this lock does not sleep the thread - while (! 
tryLock()) - { - // spin - } + // note: this lock does not sleep the thread + while (!tryLock()) + { + // spin + } } void btSpinMutex::unlock() { - volatile long* aDest = reinterpret_cast( &mLock ); - _InterlockedExchange( aDest, 0 ); + volatile long* aDest = reinterpret_cast(&mLock); + _InterlockedExchange(aDest, 0); } #elif USE_GCC_BUILTIN_ATOMICS #define THREAD_LOCAL_STATIC static __thread - bool btSpinMutex::tryLock() { - int expected = 0; - bool weak = false; - const int memOrderSuccess = __ATOMIC_ACQ_REL; - const int memOrderFail = __ATOMIC_ACQUIRE; - return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail); + int expected = 0; + bool weak = false; + const int memOrderSuccess = __ATOMIC_ACQ_REL; + const int memOrderFail = __ATOMIC_ACQUIRE; + return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail); } void btSpinMutex::lock() { - // note: this lock does not sleep the thread - while (! tryLock()) - { - // spin - } + // note: this lock does not sleep the thread + while (!tryLock()) + { + // spin + } } void btSpinMutex::unlock() { - __atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE); + __atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE); } #elif USE_GCC_BUILTIN_ATOMICS_OLD - #define THREAD_LOCAL_STATIC static __thread bool btSpinMutex::tryLock() { - return __sync_bool_compare_and_swap(&mLock, int(0), int(1)); + return __sync_bool_compare_and_swap(&mLock, int(0), int(1)); } void btSpinMutex::lock() { - // note: this lock does not sleep the thread - while (! tryLock()) - { - // spin - } + // note: this lock does not sleep the thread + while (!tryLock()) + { + // spin + } } void btSpinMutex::unlock() { - // write 0 - __sync_fetch_and_and(&mLock, int(0)); + // write 0 + __sync_fetch_and_and(&mLock, int(0)); } -#else //#elif USE_MSVC_INTRINSICS +#else //#elif USE_MSVC_INTRINSICS #error "no threading primitives defined -- unknown platform" #endif //#else //#elif USE_MSVC_INTRINSICS -#else //#if BT_THREADSAFE +#else //#if BT_THREADSAFE // These should not be called ever void btSpinMutex::lock() { - btAssert( !"unimplemented btSpinMutex::lock() called" ); + btAssert(!"unimplemented btSpinMutex::lock() called"); } void btSpinMutex::unlock() { - btAssert( !"unimplemented btSpinMutex::unlock() called" ); + btAssert(!"unimplemented btSpinMutex::unlock() called"); } bool btSpinMutex::tryLock() { - btAssert( !"unimplemented btSpinMutex::tryLock() called" ); - return true; + btAssert(!"unimplemented btSpinMutex::tryLock() called"); + return true; } #define THREAD_LOCAL_STATIC static -#endif // #else //#if BT_THREADSAFE - +#endif // #else //#if BT_THREADSAFE struct ThreadsafeCounter { - unsigned int mCounter; - btSpinMutex mMutex; - - ThreadsafeCounter() - { - mCounter = 0; - --mCounter; // first count should come back 0 - } - - unsigned int getNext() - { - // no need to optimize this with atomics, it is only called ONCE per thread! - mMutex.lock(); - mCounter++; - if ( mCounter >= BT_MAX_THREAD_COUNT ) - { - btAssert( !"thread counter exceeded" ); - // wrap back to the first worker index - mCounter = 1; - } - unsigned int val = mCounter; - mMutex.unlock(); - return val; - } + unsigned int mCounter; + btSpinMutex mMutex; + + ThreadsafeCounter() + { + mCounter = 0; + --mCounter; // first count should come back 0 + } + + unsigned int getNext() + { + // no need to optimize this with atomics, it is only called ONCE per thread! 
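		// [editor's note] Two details worth spelling out: the constructor wraps
		// mCounter from 0 to ~0U (unsigned underflow) so the first getNext()
		// call, made by the main thread, returns 0, and workers then get
		// 1, 2, ... The btSpinMutex below is sufficient here precisely because
		// each thread takes this path only once, so contention is a one-time
		// startup cost.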
+ mMutex.lock(); + mCounter++; + if (mCounter >= BT_MAX_THREAD_COUNT) + { + btAssert(!"thread counter exceeded"); + // wrap back to the first worker index + mCounter = 1; + } + unsigned int val = mCounter; + mMutex.unlock(); + return val; + } }; - -static btITaskScheduler* gBtTaskScheduler; +static btITaskScheduler* gBtTaskScheduler=0; static int gThreadsRunningCounter = 0; // useful for detecting if we are trying to do nested parallel-for calls static btSpinMutex gThreadsRunningCounterMutex; static ThreadsafeCounter gThreadCounter; - // // BT_DETECT_BAD_THREAD_INDEX tries to detect when there are multiple threads assigned the same thread index. // @@ -276,7 +263,7 @@ static ThreadsafeCounter gThreadCounter; // We allocate thread-indexes as needed with a sequential global thread counter. // // Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but -// continues to re-use other threads and the application repeatedly resizes the thread pool of the +// continues to re-use other threads and the application repeatedly resizes the thread pool of the // task scheduler. // In order to prevent the thread-counter from exceeding the global max (BT_MAX_THREAD_COUNT), we // wrap the thread counter back to 1. This should only happen if the worker threads have all been @@ -290,197 +277,192 @@ static ThreadsafeCounter gThreadCounter; typedef DWORD ThreadId_t; const static ThreadId_t kInvalidThreadId = 0; -ThreadId_t gDebugThreadIds[ BT_MAX_THREAD_COUNT ]; +ThreadId_t gDebugThreadIds[BT_MAX_THREAD_COUNT]; static ThreadId_t getDebugThreadId() { - return GetCurrentThreadId(); + return GetCurrentThreadId(); } -#endif // #if BT_DETECT_BAD_THREAD_INDEX - +#endif // #if BT_DETECT_BAD_THREAD_INDEX // return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT) unsigned int btGetCurrentThreadIndex() { - const unsigned int kNullIndex = ~0U; - THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex; - if ( sThreadIndex == kNullIndex ) - { - sThreadIndex = gThreadCounter.getNext(); - btAssert( sThreadIndex < BT_MAX_THREAD_COUNT ); - } + const unsigned int kNullIndex = ~0U; + THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex; + if (sThreadIndex == kNullIndex) + { + sThreadIndex = gThreadCounter.getNext(); + btAssert(sThreadIndex < BT_MAX_THREAD_COUNT); + } #if BT_DETECT_BAD_THREAD_INDEX - if ( gBtTaskScheduler && sThreadIndex > 0 ) - { - ThreadId_t tid = getDebugThreadId(); - // if not set - if ( gDebugThreadIds[ sThreadIndex ] == kInvalidThreadId ) - { - // set it - gDebugThreadIds[ sThreadIndex ] = tid; - } - else - { - if ( gDebugThreadIds[ sThreadIndex ] != tid ) - { - // this could indicate the task scheduler is breaking our assumptions about - // how threads are managed when threadpool is resized - btAssert( !"there are 2 or more threads with the same thread-index!" 
); - __debugbreak(); - } - } - } -#endif // #if BT_DETECT_BAD_THREAD_INDEX - return sThreadIndex; + if (gBtTaskScheduler && sThreadIndex > 0) + { + ThreadId_t tid = getDebugThreadId(); + // if not set + if (gDebugThreadIds[sThreadIndex] == kInvalidThreadId) + { + // set it + gDebugThreadIds[sThreadIndex] = tid; + } + else + { + if (gDebugThreadIds[sThreadIndex] != tid) + { + // this could indicate the task scheduler is breaking our assumptions about + // how threads are managed when threadpool is resized + btAssert(!"there are 2 or more threads with the same thread-index!"); + __debugbreak(); + } + } + } +#endif // #if BT_DETECT_BAD_THREAD_INDEX + return sThreadIndex; } bool btIsMainThread() { - return btGetCurrentThreadIndex() == 0; + return btGetCurrentThreadIndex() == 0; } void btResetThreadIndexCounter() { - // for when all current worker threads are destroyed - btAssert( btIsMainThread() ); - gThreadCounter.mCounter = 0; + // for when all current worker threads are destroyed + btAssert(btIsMainThread()); + gThreadCounter.mCounter = 0; } -btITaskScheduler::btITaskScheduler( const char* name ) +btITaskScheduler::btITaskScheduler(const char* name) { - m_name = name; - m_savedThreadCounter = 0; - m_isActive = false; + m_name = name; + m_savedThreadCounter = 0; + m_isActive = false; } void btITaskScheduler::activate() { - // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler. - // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1) - // The thread-indexes need to be unique amongst the threads that can be running simultaneously. - // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different - // task schedulers to share the same thread index because they can't be running at the same time. - // So each task scheduler needs to keep its own thread counter value - if ( !m_isActive ) - { - gThreadCounter.mCounter = m_savedThreadCounter; // restore saved thread counter - m_isActive = true; - } + // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler. + // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1) + // The thread-indexes need to be unique amongst the threads that can be running simultaneously. + // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different + // task schedulers to share the same thread index because they can't be running at the same time. 
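	// [editor's note] A sketch of what these unique indices buy us, assuming
	// btGetCurrentThreadIndex() as defined earlier in this file: a parallel
	// body can keep lock-free per-thread scratch state (names hypothetical):
	//
	//   static btScalar sPartial[BT_MAX_THREAD_COUNT];  // one slot per thread index
	//   void forLoop(int begin, int end) const
	//   {
	//       const unsigned int tid = btGetCurrentThreadIndex();  // stable per thread
	//       for (int i = begin; i < end; ++i)
	//           sPartial[tid] += doWork(i);                      // doWork(): hypothetical
	//   }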
+ // So each task scheduler needs to keep its own thread counter value + if (!m_isActive) + { + gThreadCounter.mCounter = m_savedThreadCounter; // restore saved thread counter + m_isActive = true; + } } void btITaskScheduler::deactivate() { - if ( m_isActive ) - { - m_savedThreadCounter = gThreadCounter.mCounter; // save thread counter - m_isActive = false; - } + if (m_isActive) + { + m_savedThreadCounter = gThreadCounter.mCounter; // save thread counter + m_isActive = false; + } } void btPushThreadsAreRunning() { - gThreadsRunningCounterMutex.lock(); - gThreadsRunningCounter++; - gThreadsRunningCounterMutex.unlock(); + gThreadsRunningCounterMutex.lock(); + gThreadsRunningCounter++; + gThreadsRunningCounterMutex.unlock(); } void btPopThreadsAreRunning() { - gThreadsRunningCounterMutex.lock(); - gThreadsRunningCounter--; - gThreadsRunningCounterMutex.unlock(); + gThreadsRunningCounterMutex.lock(); + gThreadsRunningCounter--; + gThreadsRunningCounterMutex.unlock(); } bool btThreadsAreRunning() { - return gThreadsRunningCounter != 0; + return gThreadsRunningCounter != 0; } - -void btSetTaskScheduler( btITaskScheduler* ts ) +void btSetTaskScheduler(btITaskScheduler* ts) { - int threadId = btGetCurrentThreadIndex(); // make sure we call this on main thread at least once before any workers run - if ( threadId != 0 ) - { - btAssert( !"btSetTaskScheduler must be called from the main thread!" ); - return; - } - if ( gBtTaskScheduler ) - { - // deactivate old task scheduler - gBtTaskScheduler->deactivate(); - } - gBtTaskScheduler = ts; - if ( ts ) - { - // activate new task scheduler - ts->activate(); - } + int threadId = btGetCurrentThreadIndex(); // make sure we call this on main thread at least once before any workers run + if (threadId != 0) + { + btAssert(!"btSetTaskScheduler must be called from the main thread!"); + return; + } + if (gBtTaskScheduler) + { + // deactivate old task scheduler + gBtTaskScheduler->deactivate(); + } + gBtTaskScheduler = ts; + if (ts) + { + // activate new task scheduler + ts->activate(); + } } - btITaskScheduler* btGetTaskScheduler() { - return gBtTaskScheduler; + return gBtTaskScheduler; } - -void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) +void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) { #if BT_THREADSAFE #if BT_DETECT_BAD_THREAD_INDEX - if ( !btThreadsAreRunning() ) - { - // clear out thread ids - for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i ) - { - gDebugThreadIds[ i ] = kInvalidThreadId; - } - } -#endif // #if BT_DETECT_BAD_THREAD_INDEX + if (!btThreadsAreRunning()) + { + // clear out thread ids + for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i) + { + gDebugThreadIds[i] = kInvalidThreadId; + } + } +#endif // #if BT_DETECT_BAD_THREAD_INDEX - btAssert( gBtTaskScheduler != NULL ); // call btSetTaskScheduler() with a valid task scheduler first! - gBtTaskScheduler->parallelFor( iBegin, iEnd, grainSize, body ); + btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first! + gBtTaskScheduler->parallelFor(iBegin, iEnd, grainSize, body); -#else // #if BT_THREADSAFE +#else // #if BT_THREADSAFE - // non-parallel version of btParallelFor - btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" ); - body.forLoop( iBegin, iEnd ); + // non-parallel version of btParallelFor + btAssert(!"called btParallelFor in non-threadsafe build. 
enable BT_THREADSAFE"); + body.forLoop(iBegin, iEnd); -#endif// #if BT_THREADSAFE +#endif // #if BT_THREADSAFE } -btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) +btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) { #if BT_THREADSAFE #if BT_DETECT_BAD_THREAD_INDEX - if ( !btThreadsAreRunning() ) - { - // clear out thread ids - for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i ) - { - gDebugThreadIds[ i ] = kInvalidThreadId; - } - } -#endif // #if BT_DETECT_BAD_THREAD_INDEX + if (!btThreadsAreRunning()) + { + // clear out thread ids + for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i) + { + gDebugThreadIds[i] = kInvalidThreadId; + } + } +#endif // #if BT_DETECT_BAD_THREAD_INDEX - btAssert( gBtTaskScheduler != NULL ); // call btSetTaskScheduler() with a valid task scheduler first! - return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body ); + btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first! + return gBtTaskScheduler->parallelSum(iBegin, iEnd, grainSize, body); -#else // #if BT_THREADSAFE +#else // #if BT_THREADSAFE - // non-parallel version of btParallelSum - btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" ); - return body.sumLoop( iBegin, iEnd ); + // non-parallel version of btParallelSum + btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE"); + return body.sumLoop(iBegin, iEnd); -#endif //#else // #if BT_THREADSAFE +#endif //#else // #if BT_THREADSAFE } - /// /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler /// (really just useful for testing performance of single threaded vs multi) @@ -488,86 +470,86 @@ btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSu class btTaskSchedulerSequential : public btITaskScheduler { public: - btTaskSchedulerSequential() : btITaskScheduler( "Sequential" ) {} - virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; } - virtual int getNumThreads() const BT_OVERRIDE { return 1; } - virtual void setNumThreads( int numThreads ) BT_OVERRIDE {} - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_sequential" ); - body.forLoop( iBegin, iEnd ); - } - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelSum_sequential" ); - return body.sumLoop( iBegin, iEnd ); - } + btTaskSchedulerSequential() : btITaskScheduler("Sequential") {} + virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; } + virtual int getNumThreads() const BT_OVERRIDE { return 1; } + virtual void setNumThreads(int numThreads) BT_OVERRIDE {} + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_sequential"); + body.forLoop(iBegin, iEnd); + } + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelSum_sequential"); + return body.sumLoop(iBegin, iEnd); + } }; - #if BT_USE_OPENMP && BT_THREADSAFE /// /// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler /// class btTaskSchedulerOpenMP : public btITaskScheduler { - int m_numThreads; + int m_numThreads; + public: - btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" ) - { - m_numThreads = 0; - } - virtual int getMaxNumThreads() const BT_OVERRIDE - 
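// --- [editor's note, not in the original source] -------------------------
// A user-side sketch of the dispatch entry points defined above (SumSquares
// is hypothetical). Each scheduler splits [iBegin, iEnd) into grains of at
// most grainSize iterations and feeds them to the body:
//
//   class SumSquares : public btIParallelSumBody
//   {
//   public:
//       virtual btScalar sumLoop(int begin, int end) const BT_OVERRIDE
//       {
//           btScalar s = btScalar(0);
//           for (int i = begin; i < end; ++i)
//               s += btScalar(i) * btScalar(i);
//           return s;
//       }
//   };
//
//   SumSquares body;
//   btScalar total = btParallelSum(0, 10000, 256, body);  // partial sums are
//                                                         // combined by the scheduler
// -------------------------------------------------------------------------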
{ - return omp_get_max_threads(); - } - virtual int getNumThreads() const BT_OVERRIDE - { - return m_numThreads; - } - virtual void setNumThreads( int numThreads ) BT_OVERRIDE - { - // With OpenMP, because it is a standard with various implementations, we can't - // know for sure if every implementation has the same behavior of destroying all - // previous threads when resizing the threadpool - m_numThreads = ( std::max )( 1, ( std::min )( int( BT_MAX_THREAD_COUNT ), numThreads ) ); - omp_set_num_threads( 1 ); // hopefully, all previous threads get destroyed here - omp_set_num_threads( m_numThreads ); - m_savedThreadCounter = 0; - if ( m_isActive ) - { - btResetThreadIndexCounter(); - } - } - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_OpenMP" ); - btPushThreadsAreRunning(); -#pragma omp parallel for schedule( static, 1 ) - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - BT_PROFILE( "OpenMP_forJob" ); - body.forLoop( i, ( std::min )( i + grainSize, iEnd ) ); - } - btPopThreadsAreRunning(); - } - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_OpenMP" ); - btPushThreadsAreRunning(); - btScalar sum = btScalar( 0 ); -#pragma omp parallel for schedule( static, 1 ) reduction(+:sum) - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - BT_PROFILE( "OpenMP_sumJob" ); - sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) ); - } - btPopThreadsAreRunning(); - return sum; - } + btTaskSchedulerOpenMP() : btITaskScheduler("OpenMP") + { + m_numThreads = 0; + } + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return omp_get_max_threads(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads(int numThreads) BT_OVERRIDE + { + // With OpenMP, because it is a standard with various implementations, we can't + // know for sure if every implementation has the same behavior of destroying all + // previous threads when resizing the threadpool + m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads)); + omp_set_num_threads(1); // hopefully, all previous threads get destroyed here + omp_set_num_threads(m_numThreads); + m_savedThreadCounter = 0; + if (m_isActive) + { + btResetThreadIndexCounter(); + } + } + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_OpenMP"); + btPushThreadsAreRunning(); +#pragma omp parallel for schedule(static, 1) + for (int i = iBegin; i < iEnd; i += grainSize) + { + BT_PROFILE("OpenMP_forJob"); + body.forLoop(i, (std::min)(i + grainSize, iEnd)); + } + btPopThreadsAreRunning(); + } + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_OpenMP"); + btPushThreadsAreRunning(); + btScalar sum = btScalar(0); +#pragma omp parallel for schedule(static, 1) reduction(+ \ + : sum) + for (int i = iBegin; i < iEnd; i += grainSize) + { + BT_PROFILE("OpenMP_sumJob"); + sum += body.sumLoop(i, (std::min)(i + grainSize, iEnd)); + } + btPopThreadsAreRunning(); + return sum; + } }; -#endif // #if BT_USE_OPENMP && BT_THREADSAFE - +#endif // #if BT_USE_OPENMP && BT_THREADSAFE #if BT_USE_TBB && BT_THREADSAFE /// @@ -575,96 +557,94 @@ public: /// class btTaskSchedulerTBB : public btITaskScheduler { - int m_numThreads; - tbb::task_scheduler_init* 
m_tbbSchedulerInit; + int m_numThreads; + tbb::task_scheduler_init* m_tbbSchedulerInit; public: - btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" ) - { - m_numThreads = 0; - m_tbbSchedulerInit = NULL; - } - ~btTaskSchedulerTBB() - { - if ( m_tbbSchedulerInit ) - { - delete m_tbbSchedulerInit; - m_tbbSchedulerInit = NULL; - } - } - - virtual int getMaxNumThreads() const BT_OVERRIDE - { - return tbb::task_scheduler_init::default_num_threads(); - } - virtual int getNumThreads() const BT_OVERRIDE - { - return m_numThreads; - } - virtual void setNumThreads( int numThreads ) BT_OVERRIDE - { - m_numThreads = ( std::max )( 1, ( std::min )( int(BT_MAX_THREAD_COUNT), numThreads ) ); - if ( m_tbbSchedulerInit ) - { - // destroys all previous threads - delete m_tbbSchedulerInit; - m_tbbSchedulerInit = NULL; - } - m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads ); - m_savedThreadCounter = 0; - if ( m_isActive ) - { - btResetThreadIndexCounter(); - } - } - struct ForBodyAdapter - { - const btIParallelForBody* mBody; - - ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {} - void operator()( const tbb::blocked_range& range ) const - { - BT_PROFILE( "TBB_forJob" ); - mBody->forLoop( range.begin(), range.end() ); - } - }; - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_TBB" ); - ForBodyAdapter tbbBody( &body ); - btPushThreadsAreRunning(); - tbb::parallel_for( tbb::blocked_range( iBegin, iEnd, grainSize ), - tbbBody, - tbb::simple_partitioner() - ); - btPopThreadsAreRunning(); - } - struct SumBodyAdapter - { - const btIParallelSumBody* mBody; - btScalar mSum; - - SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {} - SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {} - void join( const SumBodyAdapter& src ) { mSum += src.mSum; } - void operator()( const tbb::blocked_range& range ) - { - BT_PROFILE( "TBB_sumJob" ); - mSum += mBody->sumLoop( range.begin(), range.end() ); - } - }; - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelSum_TBB" ); - SumBodyAdapter tbbBody( &body ); - btPushThreadsAreRunning(); - tbb::parallel_deterministic_reduce( tbb::blocked_range( iBegin, iEnd, grainSize ), tbbBody ); - btPopThreadsAreRunning(); - return tbbBody.mSum; - } + btTaskSchedulerTBB() : btITaskScheduler("IntelTBB") + { + m_numThreads = 0; + m_tbbSchedulerInit = NULL; + } + ~btTaskSchedulerTBB() + { + if (m_tbbSchedulerInit) + { + delete m_tbbSchedulerInit; + m_tbbSchedulerInit = NULL; + } + } + + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return tbb::task_scheduler_init::default_num_threads(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads(int numThreads) BT_OVERRIDE + { + m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads)); + if (m_tbbSchedulerInit) + { + // destroys all previous threads + delete m_tbbSchedulerInit; + m_tbbSchedulerInit = NULL; + } + m_tbbSchedulerInit = new tbb::task_scheduler_init(m_numThreads); + m_savedThreadCounter = 0; + if (m_isActive) + { + btResetThreadIndexCounter(); + } + } + struct ForBodyAdapter + { + const btIParallelForBody* mBody; + + ForBodyAdapter(const btIParallelForBody* body) : mBody(body) {} + void operator()(const tbb::blocked_range& range) const + { + 
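		// [editor's note] The adapter simply forwards TBB's half-open
		// blocked_range to the Bullet body; combined with simple_partitioner
		// in parallelFor() below, ranges are split until they are no larger
		// than grainSize, mirroring the grain semantics of the other
		// schedulers in this file.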
BT_PROFILE("TBB_forJob"); + mBody->forLoop(range.begin(), range.end()); + } + }; + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_TBB"); + ForBodyAdapter tbbBody(&body); + btPushThreadsAreRunning(); + tbb::parallel_for(tbb::blocked_range(iBegin, iEnd, grainSize), + tbbBody, + tbb::simple_partitioner()); + btPopThreadsAreRunning(); + } + struct SumBodyAdapter + { + const btIParallelSumBody* mBody; + btScalar mSum; + + SumBodyAdapter(const btIParallelSumBody* body) : mBody(body), mSum(btScalar(0)) {} + SumBodyAdapter(const SumBodyAdapter& src, tbb::split) : mBody(src.mBody), mSum(btScalar(0)) {} + void join(const SumBodyAdapter& src) { mSum += src.mSum; } + void operator()(const tbb::blocked_range& range) + { + BT_PROFILE("TBB_sumJob"); + mSum += mBody->sumLoop(range.begin(), range.end()); + } + }; + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelSum_TBB"); + SumBodyAdapter tbbBody(&body); + btPushThreadsAreRunning(); + tbb::parallel_deterministic_reduce(tbb::blocked_range(iBegin, iEnd, grainSize), tbbBody); + btPopThreadsAreRunning(); + return tbbBody.mSum; + } }; -#endif // #if BT_USE_TBB && BT_THREADSAFE - +#endif // #if BT_USE_TBB && BT_THREADSAFE #if BT_USE_PPL && BT_THREADSAFE /// @@ -672,148 +652,141 @@ public: /// class btTaskSchedulerPPL : public btITaskScheduler { - int m_numThreads; - concurrency::combinable m_sum; // for parallelSum + int m_numThreads; + concurrency::combinable m_sum; // for parallelSum public: - btTaskSchedulerPPL() : btITaskScheduler( "PPL" ) - { - m_numThreads = 0; - } - virtual int getMaxNumThreads() const BT_OVERRIDE - { - return concurrency::GetProcessorCount(); - } - virtual int getNumThreads() const BT_OVERRIDE - { - return m_numThreads; - } - virtual void setNumThreads( int numThreads ) BT_OVERRIDE - { - // capping the thread count for PPL due to a thread-index issue - const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31); - m_numThreads = ( std::max )( 1, ( std::min )( maxThreadCount, numThreads ) ); - using namespace concurrency; - if ( CurrentScheduler::Id() != -1 ) - { - CurrentScheduler::Detach(); - } - SchedulerPolicy policy; - { - // PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads - // force it to destroy old threads - policy.SetConcurrencyLimits( 1, 1 ); - CurrentScheduler::Create( policy ); - CurrentScheduler::Detach(); - } - policy.SetConcurrencyLimits( m_numThreads, m_numThreads ); - CurrentScheduler::Create( policy ); - m_savedThreadCounter = 0; - if ( m_isActive ) - { - btResetThreadIndexCounter(); - } - } - struct ForBodyAdapter - { - const btIParallelForBody* mBody; - int mGrainSize; - int mIndexEnd; - - ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {} - void operator()( int i ) const - { - BT_PROFILE( "PPL_forJob" ); - mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); - } - }; - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_PPL" ); - // PPL dispatch - ForBodyAdapter pplBody( &body, grainSize, iEnd ); - btPushThreadsAreRunning(); - // note: MSVC 2010 doesn't support partitioner args, so avoid them - concurrency::parallel_for( iBegin, - iEnd, - grainSize, - pplBody - ); - btPopThreadsAreRunning(); - } - struct SumBodyAdapter 
- { - const btIParallelSumBody* mBody; - concurrency::combinable* mSum; - int mGrainSize; - int mIndexEnd; - - SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable* sum, int grainSize, int end ) : mBody( body ), mSum(sum), mGrainSize( grainSize ), mIndexEnd( end ) {} - void operator()( int i ) const - { - BT_PROFILE( "PPL_sumJob" ); - mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); - } - }; - static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; } - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelSum_PPL" ); - m_sum.clear(); - SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd ); - btPushThreadsAreRunning(); - // note: MSVC 2010 doesn't support partitioner args, so avoid them - concurrency::parallel_for( iBegin, - iEnd, - grainSize, - pplBody - ); - btPopThreadsAreRunning(); - return m_sum.combine( sumFunc ); - } + btTaskSchedulerPPL() : btITaskScheduler("PPL") + { + m_numThreads = 0; + } + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return concurrency::GetProcessorCount(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads(int numThreads) BT_OVERRIDE + { + // capping the thread count for PPL due to a thread-index issue + const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31); + m_numThreads = (std::max)(1, (std::min)(maxThreadCount, numThreads)); + using namespace concurrency; + if (CurrentScheduler::Id() != -1) + { + CurrentScheduler::Detach(); + } + SchedulerPolicy policy; + { + // PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads + // force it to destroy old threads + policy.SetConcurrencyLimits(1, 1); + CurrentScheduler::Create(policy); + CurrentScheduler::Detach(); + } + policy.SetConcurrencyLimits(m_numThreads, m_numThreads); + CurrentScheduler::Create(policy); + m_savedThreadCounter = 0; + if (m_isActive) + { + btResetThreadIndexCounter(); + } + } + struct ForBodyAdapter + { + const btIParallelForBody* mBody; + int mGrainSize; + int mIndexEnd; + + ForBodyAdapter(const btIParallelForBody* body, int grainSize, int end) : mBody(body), mGrainSize(grainSize), mIndexEnd(end) {} + void operator()(int i) const + { + BT_PROFILE("PPL_forJob"); + mBody->forLoop(i, (std::min)(i + mGrainSize, mIndexEnd)); + } + }; + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_PPL"); + // PPL dispatch + ForBodyAdapter pplBody(&body, grainSize, iEnd); + btPushThreadsAreRunning(); + // note: MSVC 2010 doesn't support partitioner args, so avoid them + concurrency::parallel_for(iBegin, + iEnd, + grainSize, + pplBody); + btPopThreadsAreRunning(); + } + struct SumBodyAdapter + { + const btIParallelSumBody* mBody; + concurrency::combinable* mSum; + int mGrainSize; + int mIndexEnd; + + SumBodyAdapter(const btIParallelSumBody* body, concurrency::combinable* sum, int grainSize, int end) : mBody(body), mSum(sum), mGrainSize(grainSize), mIndexEnd(end) {} + void operator()(int i) const + { + BT_PROFILE("PPL_sumJob"); + mSum->local() += mBody->sumLoop(i, (std::min)(i + mGrainSize, mIndexEnd)); + } + }; + static btScalar sumFunc(btScalar a, btScalar b) { return a + b; } + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelSum_PPL"); + m_sum.clear(); + SumBodyAdapter 
pplBody(&body, &m_sum, grainSize, iEnd); + btPushThreadsAreRunning(); + // note: MSVC 2010 doesn't support partitioner args, so avoid them + concurrency::parallel_for(iBegin, + iEnd, + grainSize, + pplBody); + btPopThreadsAreRunning(); + return m_sum.combine(sumFunc); + } }; -#endif // #if BT_USE_PPL && BT_THREADSAFE - +#endif // #if BT_USE_PPL && BT_THREADSAFE // create a non-threaded task scheduler (always available) btITaskScheduler* btGetSequentialTaskScheduler() { - static btTaskSchedulerSequential sTaskScheduler; - return &sTaskScheduler; + static btTaskSchedulerSequential sTaskScheduler; + return &sTaskScheduler; } - // create an OpenMP task scheduler (if available, otherwise returns null) btITaskScheduler* btGetOpenMPTaskScheduler() { #if BT_USE_OPENMP && BT_THREADSAFE - static btTaskSchedulerOpenMP sTaskScheduler; - return &sTaskScheduler; + static btTaskSchedulerOpenMP sTaskScheduler; + return &sTaskScheduler; #else - return NULL; + return NULL; #endif } - // create an Intel TBB task scheduler (if available, otherwise returns null) btITaskScheduler* btGetTBBTaskScheduler() { #if BT_USE_TBB && BT_THREADSAFE - static btTaskSchedulerTBB sTaskScheduler; - return &sTaskScheduler; + static btTaskSchedulerTBB sTaskScheduler; + return &sTaskScheduler; #else - return NULL; + return NULL; #endif } - // create a PPL task scheduler (if available, otherwise returns null) btITaskScheduler* btGetPPLTaskScheduler() { #if BT_USE_PPL && BT_THREADSAFE - static btTaskSchedulerPPL sTaskScheduler; - return &sTaskScheduler; + static btTaskSchedulerPPL sTaskScheduler; + return &sTaskScheduler; #else - return NULL; + return NULL; #endif } - diff --git a/thirdparty/bullet/LinearMath/btThreads.h b/thirdparty/bullet/LinearMath/btThreads.h index 921fd088c0..b2227e1724 100644 --- a/thirdparty/bullet/LinearMath/btThreads.h +++ b/thirdparty/bullet/LinearMath/btThreads.h @@ -12,14 +12,12 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_THREADS_H #define BT_THREADS_H -#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE +#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE -#if defined (_MSC_VER) && _MSC_VER >= 1600 +#if defined(_MSC_VER) && _MSC_VER >= 1600 // give us a compile error if any signatures of overriden methods is changed #define BT_OVERRIDE override #endif @@ -36,7 +34,7 @@ const unsigned int BT_MAX_THREAD_COUNT = 64; // only if BT_THREADSAFE is 1 bool btIsMainThread(); bool btThreadsAreRunning(); unsigned int btGetCurrentThreadIndex(); -void btResetThreadIndexCounter(); // notify that all worker threads have been destroyed +void btResetThreadIndexCounter(); // notify that all worker threads have been destroyed /// /// btSpinMutex -- lightweight spin-mutex implemented with atomic ops, never puts @@ -46,19 +44,18 @@ void btResetThreadIndexCounter(); // notify that all worker threads have been de /// class btSpinMutex { - int mLock; + int mLock; public: - btSpinMutex() - { - mLock = 0; - } - void lock(); - void unlock(); - bool tryLock(); + btSpinMutex() + { + mLock = 0; + } + void lock(); + void unlock(); + bool tryLock(); }; - // // NOTE: btMutex* is for internal Bullet use only // @@ -70,43 +67,42 @@ public: // of bad because if you call any of these functions from external code // (where BT_THREADSAFE is undefined) you will get unexpected race conditions. 
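// [editor's note, not in the original header] Pulling the pieces together, a
// minimal end-to-end sketch of scheduler selection and dispatch, assuming the
// factory functions declared later in this header and defined in
// btThreads.cpp above (MyBody is hypothetical):
//
//   #include "LinearMath/btThreads.h"
//
//   class MyBody : public btIParallelForBody
//   {
//   public:
//       virtual void forLoop(int begin, int end) const BT_OVERRIDE
//       {
//           for (int i = begin; i < end; ++i)
//           {
//               // independent per-index work; iterations may run out of order
//           }
//       }
//   };
//
//   btITaskScheduler* ts = btGetOpenMPTaskScheduler();  // NULL if not built in
//   if (!ts)
//       ts = btGetSequentialTaskScheduler();            // always available
//   ts->setNumThreads(ts->getMaxNumThreads());
//   btSetTaskScheduler(ts);                             // main thread only
//   MyBody body;
//   btParallelFor(0, 1024, 64, body);                   // grainSize = 64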
// -SIMD_FORCE_INLINE void btMutexLock( btSpinMutex* mutex ) +SIMD_FORCE_INLINE void btMutexLock(btSpinMutex* mutex) { #if BT_THREADSAFE - mutex->lock(); + mutex->lock(); #else - (void)mutex; -#endif // #if BT_THREADSAFE + (void)mutex; +#endif // #if BT_THREADSAFE } -SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* mutex ) +SIMD_FORCE_INLINE void btMutexUnlock(btSpinMutex* mutex) { #if BT_THREADSAFE - mutex->unlock(); + mutex->unlock(); #else - (void)mutex; -#endif // #if BT_THREADSAFE + (void)mutex; +#endif // #if BT_THREADSAFE } -SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* mutex ) +SIMD_FORCE_INLINE bool btMutexTryLock(btSpinMutex* mutex) { #if BT_THREADSAFE - return mutex->tryLock(); + return mutex->tryLock(); #else - (void)mutex; - return true; -#endif // #if BT_THREADSAFE + (void)mutex; + return true; +#endif // #if BT_THREADSAFE } - // // btIParallelForBody -- subclass this to express work that can be done in parallel // class btIParallelForBody { public: - virtual ~btIParallelForBody() {} - virtual void forLoop( int iBegin, int iEnd ) const = 0; + virtual ~btIParallelForBody() {} + virtual void forLoop(int iBegin, int iEnd) const = 0; }; // @@ -116,8 +112,8 @@ public: class btIParallelSumBody { public: - virtual ~btIParallelSumBody() {} - virtual btScalar sumLoop( int iBegin, int iEnd ) const = 0; + virtual ~btIParallelSumBody() {} + virtual btScalar sumLoop(int iBegin, int iEnd) const = 0; }; // @@ -127,30 +123,30 @@ public: class btITaskScheduler { public: - btITaskScheduler( const char* name ); - virtual ~btITaskScheduler() {} - const char* getName() const { return m_name; } + btITaskScheduler(const char* name); + virtual ~btITaskScheduler() {} + const char* getName() const { return m_name; } - virtual int getMaxNumThreads() const = 0; - virtual int getNumThreads() const = 0; - virtual void setNumThreads( int numThreads ) = 0; - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0; - virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) = 0; - virtual void sleepWorkerThreadsHint() {} // hint the task scheduler that we may not be using these threads for a little while + virtual int getMaxNumThreads() const = 0; + virtual int getNumThreads() const = 0; + virtual void setNumThreads(int numThreads) = 0; + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) = 0; + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) = 0; + virtual void sleepWorkerThreadsHint() {} // hint the task scheduler that we may not be using these threads for a little while - // internal use only - virtual void activate(); - virtual void deactivate(); + // internal use only + virtual void activate(); + virtual void deactivate(); protected: - const char* m_name; - unsigned int m_savedThreadCounter; - bool m_isActive; + const char* m_name; + unsigned int m_savedThreadCounter; + bool m_isActive; }; // set the task scheduler to use for all calls to btParallelFor() // NOTE: you must set this prior to using any of the multi-threaded "Mt" classes -void btSetTaskScheduler( btITaskScheduler* ts ); +void btSetTaskScheduler(btITaskScheduler* ts); // get the current task scheduler btITaskScheduler* btGetTaskScheduler(); @@ -172,11 +168,10 @@ btITaskScheduler* btGetPPLTaskScheduler(); // btParallelFor -- call this to dispatch work like a for-loop // (iterations may be done out of order, so no dependencies are allowed) -void btParallelFor( 
int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ); +void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body); // btParallelSum -- call this to dispatch work like a for-loop, returns the sum of all iterations // (iterations may be done out of order, so no dependencies are allowed) -btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ); - +btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body); #endif diff --git a/thirdparty/bullet/LinearMath/btTransform.h b/thirdparty/bullet/LinearMath/btTransform.h index d4f939a5d9..6f2f99818c 100644 --- a/thirdparty/bullet/LinearMath/btTransform.h +++ b/thirdparty/bullet/LinearMath/btTransform.h @@ -12,12 +12,9 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_TRANSFORM_H #define BT_TRANSFORM_H - #include "btMatrix3x3.h" #ifdef BT_USE_DOUBLE_PRECISION @@ -26,46 +23,45 @@ subject to the following restrictions: #define btTransformData btTransformFloatData #endif - - - /**@brief The btTransform class supports rigid transforms with only translation and rotation and no scaling/shear. *It can be used in combination with btVector3, btQuaternion and btMatrix3x3 linear algebra classes. */ -ATTRIBUTE_ALIGNED16(class) btTransform { - - ///Storage for the rotation +ATTRIBUTE_ALIGNED16(class) +btTransform +{ + ///Storage for the rotation btMatrix3x3 m_basis; - ///Storage for the translation - btVector3 m_origin; + ///Storage for the translation + btVector3 m_origin; public: - - /**@brief No initialization constructor */ + /**@brief No initialization constructor */ btTransform() {} - /**@brief Constructor from btQuaternion (optional btVector3 ) + /**@brief Constructor from btQuaternion (optional btVector3 ) * @param q Rotation from quaternion * @param c Translation from Vector (default 0,0,0) */ - explicit SIMD_FORCE_INLINE btTransform(const btQuaternion& q, - const btVector3& c = btVector3(btScalar(0), btScalar(0), btScalar(0))) + explicit SIMD_FORCE_INLINE btTransform(const btQuaternion& q, + const btVector3& c = btVector3(btScalar(0), btScalar(0), btScalar(0))) : m_basis(q), - m_origin(c) - {} + m_origin(c) + { + } - /**@brief Constructor from btMatrix3x3 (optional btVector3) + /**@brief Constructor from btMatrix3x3 (optional btVector3) * @param b Rotation from Matrix * @param c Translation from Vector default (0,0,0)*/ - explicit SIMD_FORCE_INLINE btTransform(const btMatrix3x3& b, - const btVector3& c = btVector3(btScalar(0), btScalar(0), btScalar(0))) + explicit SIMD_FORCE_INLINE btTransform(const btMatrix3x3& b, + const btVector3& c = btVector3(btScalar(0), btScalar(0), btScalar(0))) : m_basis(b), - m_origin(c) - {} - /**@brief Copy constructor */ - SIMD_FORCE_INLINE btTransform (const btTransform& other) + m_origin(c) + { + } + /**@brief Copy constructor */ + SIMD_FORCE_INLINE btTransform(const btTransform& other) : m_basis(other.m_basis), - m_origin(other.m_origin) + m_origin(other.m_origin) { } - /**@brief Assignment Operator */ + /**@brief Assignment Operator */ SIMD_FORCE_INLINE btTransform& operator=(const btTransform& other) { m_basis = other.m_basis; @@ -73,70 +69,70 @@ public: return *this; } - - /**@brief Set the current transform as the value of the product of two transforms + /**@brief Set the current transform as the value of the product of two transforms * @param t1 Transform 1 * @param t2 Transform 2 * This = Transform1 * Transform2 */ - 
SIMD_FORCE_INLINE void mult(const btTransform& t1, const btTransform& t2) { - m_basis = t1.m_basis * t2.m_basis; - m_origin = t1(t2.m_origin); - } + SIMD_FORCE_INLINE void mult(const btTransform& t1, const btTransform& t2) + { + m_basis = t1.m_basis * t2.m_basis; + m_origin = t1(t2.m_origin); + } -/* void multInverseLeft(const btTransform& t1, const btTransform& t2) { + /* void multInverseLeft(const btTransform& t1, const btTransform& t2) { btVector3 v = t2.m_origin - t1.m_origin; m_basis = btMultTransposeLeft(t1.m_basis, t2.m_basis); m_origin = v * t1.m_basis; } */ -/**@brief Return the transform of the vector */ + /**@brief Return the transform of the vector */ SIMD_FORCE_INLINE btVector3 operator()(const btVector3& x) const { - return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin; + return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin; } - /**@brief Return the transform of the vector */ + /**@brief Return the transform of the vector */ SIMD_FORCE_INLINE btVector3 operator*(const btVector3& x) const { return (*this)(x); } - /**@brief Return the transform of the btQuaternion */ + /**@brief Return the transform of the btQuaternion */ SIMD_FORCE_INLINE btQuaternion operator*(const btQuaternion& q) const { return getRotation() * q; } - /**@brief Return the basis matrix for the rotation */ - SIMD_FORCE_INLINE btMatrix3x3& getBasis() { return m_basis; } - /**@brief Return the basis matrix for the rotation */ - SIMD_FORCE_INLINE const btMatrix3x3& getBasis() const { return m_basis; } + /**@brief Return the basis matrix for the rotation */ + SIMD_FORCE_INLINE btMatrix3x3& getBasis() { return m_basis; } + /**@brief Return the basis matrix for the rotation */ + SIMD_FORCE_INLINE const btMatrix3x3& getBasis() const { return m_basis; } - /**@brief Return the origin vector translation */ - SIMD_FORCE_INLINE btVector3& getOrigin() { return m_origin; } - /**@brief Return the origin vector translation */ - SIMD_FORCE_INLINE const btVector3& getOrigin() const { return m_origin; } + /**@brief Return the origin vector translation */ + SIMD_FORCE_INLINE btVector3& getOrigin() { return m_origin; } + /**@brief Return the origin vector translation */ + SIMD_FORCE_INLINE const btVector3& getOrigin() const { return m_origin; } - /**@brief Return a quaternion representing the rotation */ - btQuaternion getRotation() const { + /**@brief Return a quaternion representing the rotation */ + btQuaternion getRotation() const + { btQuaternion q; m_basis.getRotation(q); return q; } - - - /**@brief Set from an array + + /**@brief Set from an array * @param m A pointer to a 16 element array (12 rotation(row major padded on the right by 1), and 3 translation */ - void setFromOpenGLMatrix(const btScalar *m) + void setFromOpenGLMatrix(const btScalar* m) { m_basis.setFromOpenGLSubMatrix(m); - m_origin.setValue(m[12],m[13],m[14]); + m_origin.setValue(m[12], m[13], m[14]); } - /**@brief Fill an array representation + /**@brief Fill an array representation * @param m A pointer to a 16 element array (12 rotation(row major padded on the right by 1), and 3 translation */ - void getOpenGLMatrix(btScalar *m) const + void getOpenGLMatrix(btScalar * m) const { m_basis.getOpenGLSubMatrix(m); m[12] = m_origin.x(); @@ -145,80 +141,76 @@ public: m[15] = btScalar(1.0); } - /**@brief Set the translational element + /**@brief Set the translational element * @param origin The vector to set the translation to */ - SIMD_FORCE_INLINE void setOrigin(const btVector3& origin) - { + SIMD_FORCE_INLINE void setOrigin(const btVector3& 
origin) + { m_origin = origin; } SIMD_FORCE_INLINE btVector3 invXform(const btVector3& inVec) const; - - /**@brief Set the rotational element by btMatrix3x3 */ + /**@brief Set the rotational element by btMatrix3x3 */ SIMD_FORCE_INLINE void setBasis(const btMatrix3x3& basis) - { + { m_basis = basis; } - /**@brief Set the rotational element by btQuaternion */ + /**@brief Set the rotational element by btQuaternion */ SIMD_FORCE_INLINE void setRotation(const btQuaternion& q) { m_basis.setRotation(q); } - - /**@brief Set this transformation to the identity */ + /**@brief Set this transformation to the identity */ void setIdentity() { m_basis.setIdentity(); m_origin.setValue(btScalar(0.0), btScalar(0.0), btScalar(0.0)); } - /**@brief Multiply this Transform by another(this = this * another) + /**@brief Multiply this Transform by another(this = this * another) * @param t The other transform */ - btTransform& operator*=(const btTransform& t) + btTransform& operator*=(const btTransform& t) { m_origin += m_basis * t.m_origin; m_basis *= t.m_basis; return *this; } - /**@brief Return the inverse of this transform */ + /**@brief Return the inverse of this transform */ btTransform inverse() const - { + { btMatrix3x3 inv = m_basis.transpose(); return btTransform(inv, inv * -m_origin); } - /**@brief Return the inverse of this transform times the other transform + /**@brief Return the inverse of this transform times the other transform * @param t The other transform * return this.inverse() * the other */ - btTransform inverseTimes(const btTransform& t) const; + btTransform inverseTimes(const btTransform& t) const; - /**@brief Return the product of this transform and the other */ + /**@brief Return the product of this transform and the other */ btTransform operator*(const btTransform& t) const; - /**@brief Return an identity transform */ - static const btTransform& getIdentity() + /**@brief Return an identity transform */ + static const btTransform& getIdentity() { static const btTransform identityTransform(btMatrix3x3::getIdentity()); return identityTransform; } - void serialize(struct btTransformData& dataOut) const; - - void serializeFloat(struct btTransformFloatData& dataOut) const; + void serialize(struct btTransformData & dataOut) const; - void deSerialize(const struct btTransformData& dataIn); + void serializeFloat(struct btTransformFloatData & dataOut) const; - void deSerializeDouble(const struct btTransformDoubleData& dataIn); + void deSerialize(const struct btTransformData& dataIn); - void deSerializeFloat(const struct btTransformFloatData& dataIn); + void deSerializeDouble(const struct btTransformDoubleData& dataIn); + void deSerializeFloat(const struct btTransformFloatData& dataIn); }; - SIMD_FORCE_INLINE btVector3 btTransform::invXform(const btVector3& inVec) const { @@ -226,80 +218,69 @@ btTransform::invXform(const btVector3& inVec) const return (m_basis.transpose() * v); } -SIMD_FORCE_INLINE btTransform -btTransform::inverseTimes(const btTransform& t) const +SIMD_FORCE_INLINE btTransform +btTransform::inverseTimes(const btTransform& t) const { btVector3 v = t.getOrigin() - m_origin; - return btTransform(m_basis.transposeTimes(t.m_basis), - v * m_basis); + return btTransform(m_basis.transposeTimes(t.m_basis), + v * m_basis); } -SIMD_FORCE_INLINE btTransform -btTransform::operator*(const btTransform& t) const +SIMD_FORCE_INLINE btTransform + btTransform::operator*(const btTransform& t) const { - return btTransform(m_basis * t.m_basis, - (*this)(t.m_origin)); + return btTransform(m_basis * 
t.m_basis, + (*this)(t.m_origin)); } /**@brief Test if two transforms have all elements equal */ SIMD_FORCE_INLINE bool operator==(const btTransform& t1, const btTransform& t2) { - return ( t1.getBasis() == t2.getBasis() && - t1.getOrigin() == t2.getOrigin() ); + return (t1.getBasis() == t2.getBasis() && + t1.getOrigin() == t2.getOrigin()); } - ///for serialization -struct btTransformFloatData +struct btTransformFloatData { - btMatrix3x3FloatData m_basis; - btVector3FloatData m_origin; + btMatrix3x3FloatData m_basis; + btVector3FloatData m_origin; }; -struct btTransformDoubleData +struct btTransformDoubleData { - btMatrix3x3DoubleData m_basis; - btVector3DoubleData m_origin; + btMatrix3x3DoubleData m_basis; + btVector3DoubleData m_origin; }; - - -SIMD_FORCE_INLINE void btTransform::serialize(btTransformData& dataOut) const +SIMD_FORCE_INLINE void btTransform::serialize(btTransformData& dataOut) const { m_basis.serialize(dataOut.m_basis); m_origin.serialize(dataOut.m_origin); } -SIMD_FORCE_INLINE void btTransform::serializeFloat(btTransformFloatData& dataOut) const +SIMD_FORCE_INLINE void btTransform::serializeFloat(btTransformFloatData& dataOut) const { m_basis.serializeFloat(dataOut.m_basis); m_origin.serializeFloat(dataOut.m_origin); } - -SIMD_FORCE_INLINE void btTransform::deSerialize(const btTransformData& dataIn) +SIMD_FORCE_INLINE void btTransform::deSerialize(const btTransformData& dataIn) { m_basis.deSerialize(dataIn.m_basis); m_origin.deSerialize(dataIn.m_origin); } -SIMD_FORCE_INLINE void btTransform::deSerializeFloat(const btTransformFloatData& dataIn) +SIMD_FORCE_INLINE void btTransform::deSerializeFloat(const btTransformFloatData& dataIn) { m_basis.deSerializeFloat(dataIn.m_basis); m_origin.deSerializeFloat(dataIn.m_origin); } -SIMD_FORCE_INLINE void btTransform::deSerializeDouble(const btTransformDoubleData& dataIn) +SIMD_FORCE_INLINE void btTransform::deSerializeDouble(const btTransformDoubleData& dataIn) { m_basis.deSerializeDouble(dataIn.m_basis); m_origin.deSerializeDouble(dataIn.m_origin); } - -#endif //BT_TRANSFORM_H - - - - - - +#endif //BT_TRANSFORM_H diff --git a/thirdparty/bullet/LinearMath/btTransformUtil.h b/thirdparty/bullet/LinearMath/btTransformUtil.h index 182cc43fab..b874dd6807 100644 --- a/thirdparty/bullet/LinearMath/btTransformUtil.h +++ b/thirdparty/bullet/LinearMath/btTransformUtil.h @@ -12,77 +12,66 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef BT_TRANSFORM_UTIL_H #define BT_TRANSFORM_UTIL_H #include "btTransform.h" -#define ANGULAR_MOTION_THRESHOLD btScalar(0.5)*SIMD_HALF_PI - - - +#define ANGULAR_MOTION_THRESHOLD btScalar(0.5) * SIMD_HALF_PI -SIMD_FORCE_INLINE btVector3 btAabbSupport(const btVector3& halfExtents,const btVector3& supportDir) +SIMD_FORCE_INLINE btVector3 btAabbSupport(const btVector3& halfExtents, const btVector3& supportDir) { return btVector3(supportDir.x() < btScalar(0.0) ? -halfExtents.x() : halfExtents.x(), - supportDir.y() < btScalar(0.0) ? -halfExtents.y() : halfExtents.y(), - supportDir.z() < btScalar(0.0) ? -halfExtents.z() : halfExtents.z()); + supportDir.y() < btScalar(0.0) ? -halfExtents.y() : halfExtents.y(), + supportDir.z() < btScalar(0.0) ? 
-halfExtents.z() : halfExtents.z()); } - - - - - /// Utils related to temporal transforms class btTransformUtil { - public: - - static void integrateTransform(const btTransform& curTrans,const btVector3& linvel,const btVector3& angvel,btScalar timeStep,btTransform& predictedTransform) + static void integrateTransform(const btTransform& curTrans, const btVector3& linvel, const btVector3& angvel, btScalar timeStep, btTransform& predictedTransform) { predictedTransform.setOrigin(curTrans.getOrigin() + linvel * timeStep); -// #define QUATERNION_DERIVATIVE - #ifdef QUATERNION_DERIVATIVE + // #define QUATERNION_DERIVATIVE +#ifdef QUATERNION_DERIVATIVE btQuaternion predictedOrn = curTrans.getRotation(); predictedOrn += (angvel * predictedOrn) * (timeStep * btScalar(0.5)); predictedOrn.safeNormalize(); - #else +#else //Exponential map //google for "Practical Parameterization of Rotations Using the Exponential Map", F. Sebastian Grassia btVector3 axis; - btScalar fAngle2 = angvel.length2(); - btScalar fAngle = 0; - if (fAngle2>SIMD_EPSILON) - { - fAngle = btSqrt(fAngle2); - } + btScalar fAngle2 = angvel.length2(); + btScalar fAngle = 0; + if (fAngle2 > SIMD_EPSILON) + { + fAngle = btSqrt(fAngle2); + } //limit the angular motion - if (fAngle*timeStep > ANGULAR_MOTION_THRESHOLD) + if (fAngle * timeStep > ANGULAR_MOTION_THRESHOLD) { fAngle = ANGULAR_MOTION_THRESHOLD / timeStep; } - if ( fAngle < btScalar(0.001) ) + if (fAngle < btScalar(0.001)) { // use Taylor's expansions of sync function - axis = angvel*( btScalar(0.5)*timeStep-(timeStep*timeStep*timeStep)*(btScalar(0.020833333333))*fAngle*fAngle ); + axis = angvel * (btScalar(0.5) * timeStep - (timeStep * timeStep * timeStep) * (btScalar(0.020833333333)) * fAngle * fAngle); } else { // sync(fAngle) = sin(c*fAngle)/t - axis = angvel*( btSin(btScalar(0.5)*fAngle*timeStep)/fAngle ); + axis = angvel * (btSin(btScalar(0.5) * fAngle * timeStep) / fAngle); } - btQuaternion dorn (axis.x(),axis.y(),axis.z(),btCos( fAngle*timeStep*btScalar(0.5) )); + btQuaternion dorn(axis.x(), axis.y(), axis.z(), btCos(fAngle * timeStep * btScalar(0.5))); btQuaternion orn0 = curTrans.getRotation(); btQuaternion predictedOrn = dorn * orn0; predictedOrn.safeNormalize(); - #endif - if (predictedOrn.length2()>SIMD_EPSILON) +#endif + if (predictedOrn.length2() > SIMD_EPSILON) { predictedTransform.setRotation(predictedOrn); } @@ -92,137 +81,133 @@ public: } } - static void calculateVelocityQuaternion(const btVector3& pos0,const btVector3& pos1,const btQuaternion& orn0,const btQuaternion& orn1,btScalar timeStep,btVector3& linVel,btVector3& angVel) + static void calculateVelocityQuaternion(const btVector3& pos0, const btVector3& pos1, const btQuaternion& orn0, const btQuaternion& orn1, btScalar timeStep, btVector3& linVel, btVector3& angVel) { linVel = (pos1 - pos0) / timeStep; btVector3 axis; - btScalar angle; + btScalar angle; if (orn0 != orn1) { - calculateDiffAxisAngleQuaternion(orn0,orn1,axis,angle); + calculateDiffAxisAngleQuaternion(orn0, orn1, axis, angle); angVel = axis * angle / timeStep; - } else + } + else { - angVel.setValue(0,0,0); + angVel.setValue(0, 0, 0); } } - static void calculateDiffAxisAngleQuaternion(const btQuaternion& orn0,const btQuaternion& orn1a,btVector3& axis,btScalar& angle) + static void calculateDiffAxisAngleQuaternion(const btQuaternion& orn0, const btQuaternion& orn1a, btVector3& axis, btScalar& angle) { btQuaternion orn1 = orn0.nearest(orn1a); btQuaternion dorn = orn1 * orn0.inverse(); angle = dorn.getAngle(); - axis = 
btVector3(dorn.x(),dorn.y(),dorn.z()); + axis = btVector3(dorn.x(), dorn.y(), dorn.z()); axis[3] = btScalar(0.); //check for axis length btScalar len = axis.length2(); - if (len < SIMD_EPSILON*SIMD_EPSILON) - axis = btVector3(btScalar(1.),btScalar(0.),btScalar(0.)); + if (len < SIMD_EPSILON * SIMD_EPSILON) + axis = btVector3(btScalar(1.), btScalar(0.), btScalar(0.)); else axis /= btSqrt(len); } - static void calculateVelocity(const btTransform& transform0,const btTransform& transform1,btScalar timeStep,btVector3& linVel,btVector3& angVel) + static void calculateVelocity(const btTransform& transform0, const btTransform& transform1, btScalar timeStep, btVector3& linVel, btVector3& angVel) { linVel = (transform1.getOrigin() - transform0.getOrigin()) / timeStep; btVector3 axis; - btScalar angle; - calculateDiffAxisAngle(transform0,transform1,axis,angle); + btScalar angle; + calculateDiffAxisAngle(transform0, transform1, axis, angle); angVel = axis * angle / timeStep; } - static void calculateDiffAxisAngle(const btTransform& transform0,const btTransform& transform1,btVector3& axis,btScalar& angle) + static void calculateDiffAxisAngle(const btTransform& transform0, const btTransform& transform1, btVector3& axis, btScalar& angle) { btMatrix3x3 dmat = transform1.getBasis() * transform0.getBasis().inverse(); btQuaternion dorn; dmat.getRotation(dorn); - ///floating point inaccuracy can lead to w component > 1..., which breaks + ///floating point inaccuracy can lead to w component > 1..., which breaks dorn.normalize(); - + angle = dorn.getAngle(); - axis = btVector3(dorn.x(),dorn.y(),dorn.z()); + axis = btVector3(dorn.x(), dorn.y(), dorn.z()); axis[3] = btScalar(0.); //check for axis length btScalar len = axis.length2(); - if (len < SIMD_EPSILON*SIMD_EPSILON) - axis = btVector3(btScalar(1.),btScalar(0.),btScalar(0.)); + if (len < SIMD_EPSILON * SIMD_EPSILON) + axis = btVector3(btScalar(1.), btScalar(0.), btScalar(0.)); else axis /= btSqrt(len); } - }; - -///The btConvexSeparatingDistanceUtil can help speed up convex collision detection +///The btConvexSeparatingDistanceUtil can help speed up convex collision detection ///by conservatively updating a cached separating distance/vector instead of re-calculating the closest distance -class btConvexSeparatingDistanceUtil +class btConvexSeparatingDistanceUtil { - btQuaternion m_ornA; - btQuaternion m_ornB; - btVector3 m_posA; - btVector3 m_posB; - - btVector3 m_separatingNormal; + btQuaternion m_ornA; + btQuaternion m_ornB; + btVector3 m_posA; + btVector3 m_posB; - btScalar m_boundingRadiusA; - btScalar m_boundingRadiusB; - btScalar m_separatingDistance; + btVector3 m_separatingNormal; -public: + btScalar m_boundingRadiusA; + btScalar m_boundingRadiusB; + btScalar m_separatingDistance; - btConvexSeparatingDistanceUtil(btScalar boundingRadiusA,btScalar boundingRadiusB) - :m_boundingRadiusA(boundingRadiusA), - m_boundingRadiusB(boundingRadiusB), - m_separatingDistance(0.f) +public: + btConvexSeparatingDistanceUtil(btScalar boundingRadiusA, btScalar boundingRadiusB) + : m_boundingRadiusA(boundingRadiusA), + m_boundingRadiusB(boundingRadiusB), + m_separatingDistance(0.f) { } - btScalar getConservativeSeparatingDistance() + btScalar getConservativeSeparatingDistance() { return m_separatingDistance; } - void updateSeparatingDistance(const btTransform& transA,const btTransform& transB) + void updateSeparatingDistance(const btTransform& transA, const btTransform& transB) { const btVector3& toPosA = transA.getOrigin(); const btVector3& toPosB = transB.getOrigin(); 
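		// Illustrative summary of the conservative update performed below:
		// the cached distance shrinks by an upper bound on the relative motion,
		//
		//     projectedMotion = |angVelA| * boundingRadiusA + |angVelB| * boundingRadiusB
		//                       + max(0, (linVelB - linVelA) . separatingNormal)
		//     m_separatingDistance -= projectedMotion;
		//
		// so the expensive closest-distance query only needs to be redone once
		// this conservative estimate reaches zero.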
btQuaternion toOrnA = transA.getRotation(); btQuaternion toOrnB = transB.getRotation(); - if (m_separatingDistance>0.f) + if (m_separatingDistance > 0.f) { - - - btVector3 linVelA,angVelA,linVelB,angVelB; - btTransformUtil::calculateVelocityQuaternion(m_posA,toPosA,m_ornA,toOrnA,btScalar(1.),linVelA,angVelA); - btTransformUtil::calculateVelocityQuaternion(m_posB,toPosB,m_ornB,toOrnB,btScalar(1.),linVelB,angVelB); + btVector3 linVelA, angVelA, linVelB, angVelB; + btTransformUtil::calculateVelocityQuaternion(m_posA, toPosA, m_ornA, toOrnA, btScalar(1.), linVelA, angVelA); + btTransformUtil::calculateVelocityQuaternion(m_posB, toPosB, m_ornB, toOrnB, btScalar(1.), linVelB, angVelB); btScalar maxAngularProjectedVelocity = angVelA.length() * m_boundingRadiusA + angVelB.length() * m_boundingRadiusB; - btVector3 relLinVel = (linVelB-linVelA); + btVector3 relLinVel = (linVelB - linVelA); btScalar relLinVelocLength = relLinVel.dot(m_separatingNormal); - if (relLinVelocLength<0.f) + if (relLinVelocLength < 0.f) { relLinVelocLength = 0.f; } - - btScalar projectedMotion = maxAngularProjectedVelocity +relLinVelocLength; + + btScalar projectedMotion = maxAngularProjectedVelocity + relLinVelocLength; m_separatingDistance -= projectedMotion; } - + m_posA = toPosA; m_posB = toPosB; m_ornA = toOrnA; m_ornB = toOrnB; } - void initSeparatingDistance(const btVector3& separatingVector,btScalar separatingDistance,const btTransform& transA,const btTransform& transB) + void initSeparatingDistance(const btVector3& separatingVector, btScalar separatingDistance, const btTransform& transA, const btTransform& transB) { m_separatingDistance = separatingDistance; - if (m_separatingDistance>0.f) + if (m_separatingDistance > 0.f) { m_separatingNormal = separatingVector; - + const btVector3& toPosA = transA.getOrigin(); const btVector3& toPosB = transB.getOrigin(); btQuaternion toOrnA = transA.getRotation(); @@ -233,9 +218,6 @@ public: m_ornB = toOrnB; } } - }; - -#endif //BT_TRANSFORM_UTIL_H - +#endif //BT_TRANSFORM_UTIL_H diff --git a/thirdparty/bullet/LinearMath/btVector3.cpp b/thirdparty/bullet/LinearMath/btVector3.cpp index e05bdccd67..13111157af 100644 --- a/thirdparty/bullet/LinearMath/btVector3.cpp +++ b/thirdparty/bullet/LinearMath/btVector3.cpp @@ -15,282 +15,285 @@ This source version has been altered. 
*/ -#if defined (_WIN32) || defined (__i386__) +#if defined(_WIN32) || defined(__i386__) #define BT_USE_SSE_IN_API #endif - #include "btVector3.h" - - #if defined BT_USE_SIMD_VECTOR3 #if DEBUG -#include //for memset +#include //for memset #endif - #ifdef __APPLE__ #include -typedef float float4 __attribute__ ((vector_size(16))); +typedef float float4 __attribute__((vector_size(16))); #else #define float4 __m128 #endif //typedef uint32_t uint4 __attribute__ ((vector_size(16))); - #if defined BT_USE_SSE || defined _WIN32 -#define LOG2_ARRAY_SIZE 6 -#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE) +#define LOG2_ARRAY_SIZE 6 +#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE) #include -long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ); -long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult); +long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult) { - const float4 *vertices = (const float4*) vv; - static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; - float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY ); - float4 vvec = _mm_loadu_ps( vec ); - float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz - float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy - - long maxIndex = -1L; - - size_t segment = 0; - float4 stack_array[ STACK_ARRAY_COUNT ]; - + const float4 *vertices = (const float4 *)vv; + static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY); + float4 vvec = _mm_loadu_ps(vec); + float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz + float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy + + long maxIndex = -1L; + + size_t segment = 0; + float4 stack_array[STACK_ARRAY_COUNT]; + #if DEBUG - //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); + //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); #endif - - size_t index; - float4 max; - // Faster loop without cleanup code for full tiles - for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) - { - max = dotMax; - - for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 ) - { // do four dot products at a time. Carefully avoid touching the w element. 
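			// (Illustrative note on the pattern used throughout this function,
			// in both the removed and the reformatted code: movelh/movehl plus
			// the 0x88/0xdd shuffles regroup four xyzw vertices into x, y and z
			// lanes, so each pass below is the vector form of, roughly,
			//
			//     dot[j] = v[j].x * vec[0] + v[j].y * vec[1] + v[j].z * vec[2];
			//
			// for j = 0..3, with _mm_max_ps(x, max) ordered so that a NaN in x
			// can never propagate into the running maximum.)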
- float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. - } - - // If we found a new max - if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax))) - { - // copy the new max across all lanes of our max accumulator - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e)); - max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1)); - - dotMax = max; - - // find first occurrence of that max - size_t test; - for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4 - {} - // record where it is. - maxIndex = 4*index + segment + indexTable[test]; - } - } - - // account for work we've already done - count -= segment; - - // Deal with the last < STACK_ARRAY_COUNT vectors - max = dotMax; - index = 0; - - - if( btUnlikely( count > 16) ) - { - for( ; index + 4 <= count / 4; index+=4 ) - { // do four dot products at a time. Carefully avoid touching the w element. 
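			// (Illustrative note: the dot products are computed in tiers. The
			// unrolled loop above consumes full tiles of STACK_ARRAY_COUNT * 4
			// vertices, this loop takes the remainder in groups of sixteen, and
			// the localCount block that follows sweeps up the leftover groups
			// of four, deferring the index search until the maximum is known.)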
- float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - - // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. - } - } - - size_t localCount = (count & -4L) - 4*index; - if( localCount ) - { + + size_t index; + float4 max; + // Faster loop without cleanup code for full tiles + for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4) + { + max = dotMax; + + for (index = 0; index < STACK_ARRAY_COUNT; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. 
+ float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. + } + + // If we found a new max + if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax))) + { + // copy the new max across all lanes of our max accumulator + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e)); + max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1)); + + dotMax = max; + + // find first occurrence of that max + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4 + { + } + // record where it is. + maxIndex = 4 * index + segment + indexTable[test]; + } + } + + // account for work we've already done + count -= segment; + + // Deal with the last < STACK_ARRAY_COUNT vectors + max = dotMax; + index = 0; + + if (btUnlikely(count > 16)) + { + for (; index + 4 <= count / 4; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. 
+ float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan + + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. 
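			// (Illustrative aside: "look for it again" refers to the recovery
			// scan shown earlier in this function; the winning value is
			// broadcast across all four lanes and stack_array is re-scanned:
			//
			//     size_t test;
			//     for (index = 0; 0 == (test = _mm_movemask_ps(
			//              _mm_cmpeq_ps(stack_array[index], max))); index++) {}
			//     maxIndex = 4 * index + segment + indexTable[test];
			//
			// which trades one cheap re-scan for an index-free inner loop.)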
+ } + } + + size_t localCount = (count & -4L) - 4 * index; + if (localCount) + { #ifdef __APPLE__ - float4 t0, t1, t2, t3, t4; - float4 * sap = &stack_array[index + localCount / 4]; - vertices += localCount; // counter the offset - size_t byteIndex = -(localCount) * sizeof(float); - //AT&T Code style assembly - asm volatile - ( ".align 4 \n\ + float4 t0, t1, t2, t3, t4; + float4 *sap = &stack_array[index + localCount / 4]; + vertices += localCount; // counter the offset + size_t byteIndex = -(localCount) * sizeof(float); + //AT&T Code style assembly + asm volatile( + ".align 4 \n\ 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\ movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\ movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\ @@ -316,368 +319,374 @@ long _maxdot_large( const float *vv, const float *vec, unsigned long count, floa add $16, %[byteIndex] // advance loop counter\n\ jnz 0b \n\ " - : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex) - : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap) - : "memory", "cc" - ); - index += localCount/4; + : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex) + : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap) + : "memory", "cc"); + index += localCount / 4; #else - { - for( unsigned int i=0; i 16) ) - { - for( ; index + 4 <= count / 4; index+=4 ) - { // do four dot products at a time. Carefully avoid touching the w element. - float4 v0 = vertices[0]; - float4 v1 = vertices[1]; - float4 v2 = vertices[2]; - float4 v3 = vertices[3]; vertices += 4; - - float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); - float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); - float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+1] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+2] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - v0 = vertices[0]; - v1 = vertices[1]; - v2 = 
vertices[2]; - v3 = vertices[3]; vertices += 4; - - lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 - hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 - lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 - hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - - lo0 = lo0*vLo; - lo1 = lo1*vLo; - z = _mm_shuffle_ps(hi0, hi1, 0x88); - x = _mm_shuffle_ps(lo0, lo1, 0x88); - y = _mm_shuffle_ps(lo0, lo1, 0xdd); - z = z*vHi; - x = x+y; - x = x+z; - stack_array[index+3] = x; - min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - - // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. - } - } - - size_t localCount = (count & -4L) - 4*index; - if( localCount ) - { - - + + size_t index; + float4 min; + // Faster loop without cleanup code for full tiles + for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4) + { + min = dotmin; + + for (index = 0; index < STACK_ARRAY_COUNT; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + // It is too costly 
to keep the index of the min here. We will look for it again later. We save a lot of work this way. + } + + // If we found a new min + if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin))) + { + // copy the new min across all lanes of our min accumulator + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e)); + min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1)); + + dotmin = min; + + // find first occurrence of that min + size_t test; + for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4 + { + } + // record where it is. + minIndex = 4 * index + segment + indexTable[test]; + } + } + + // account for work we've already done + count -= segment; + + // Deal with the last < STACK_ARRAY_COUNT vectors + min = dotmin; + index = 0; + + if (btUnlikely(count > 16)) + { + for (; index + 4 <= count / 4; index += 4) + { // do four dot products at a time. Carefully avoid touching the w element. + float4 v0 = vertices[0]; + float4 v1 = vertices[1]; + float4 v2 = vertices[2]; + float4 v3 = vertices[3]; + vertices += 4; + + float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); + float4 x = _mm_shuffle_ps(lo0, lo1, 0x88); + float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 1] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 2] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + v0 = vertices[0]; + v1 = vertices[1]; + v2 = vertices[2]; + v3 = vertices[3]; + vertices += 4; + + lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1 + hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1 + lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3 + hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3 + + lo0 = lo0 * vLo; + lo1 = lo1 * vLo; + z = _mm_shuffle_ps(hi0, hi1, 0x88); + x = _mm_shuffle_ps(lo0, lo1, 0x88); + y = _mm_shuffle_ps(lo0, lo1, 0xdd); + z = z * vHi; + x = x + y; + x = x + z; + stack_array[index + 3] = x; + min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan + + // It is too costly to keep the index of the min here. We will look for it again later. 
We save a lot of work this way. + } + } + + size_t localCount = (count & -4L) - 4 * index; + if (localCount) + { #ifdef __APPLE__ - vertices += localCount; // counter the offset - float4 t0, t1, t2, t3, t4; - size_t byteIndex = -(localCount) * sizeof(float); - float4 * sap = &stack_array[index + localCount / 4]; - - asm volatile - ( ".align 4 \n\ + vertices += localCount; // counter the offset + float4 t0, t1, t2, t3, t4; + size_t byteIndex = -(localCount) * sizeof(float); + float4 *sap = &stack_array[index + localCount / 4]; + + asm volatile( + ".align 4 \n\ 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\ movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\ movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\ @@ -703,968 +712,953 @@ long _mindot_large( const float *vv, const float *vec, unsigned long count, floa add $16, %[byteIndex] // advance loop counter\n\ jnz 0b \n\ " - : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex) - : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap) - : "memory", "cc" - ); - index += localCount/4; + : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex) + : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap) + : "memory", "cc"); + index += localCount / 4; #else - { - for( unsigned int i=0; i #include -#include //for sysctlbyname +#include //for sysctlbyname -static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ); -static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ); +static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult); +static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult); -long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel; -long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel; +long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel; +long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel; - -static inline uint32_t btGetCpuCapabilities( void ) +static inline uint32_t btGetCpuCapabilities(void) { - static uint32_t capabilities = 0; - static bool testedCapabilities = false; + static uint32_t capabilities = 0; + 
static bool testedCapabilities = false; - if( 0 == testedCapabilities) - { - uint32_t hasFeature = 0; - size_t featureSize = sizeof( hasFeature ); - int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 ); + if (0 == testedCapabilities) + { + uint32_t hasFeature = 0; + size_t featureSize = sizeof(hasFeature); + int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0); - if( 0 == err && hasFeature) - capabilities |= 0x2000; + if (0 == err && hasFeature) + capabilities |= 0x2000; testedCapabilities = true; - } - - return capabilities; -} - - + } + return capabilities; +} -static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ) +static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult) { + if (btGetCpuCapabilities() & 0x2000) + _maxdot_large = _maxdot_large_v1; + else + _maxdot_large = _maxdot_large_v0; - if( btGetCpuCapabilities() & 0x2000 ) - _maxdot_large = _maxdot_large_v1; - else - _maxdot_large = _maxdot_large_v0; - - return _maxdot_large(vv, vec, count, dotResult); + return _maxdot_large(vv, vec, count, dotResult); } -static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult ) +static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult) { + if (btGetCpuCapabilities() & 0x2000) + _mindot_large = _mindot_large_v1; + else + _mindot_large = _mindot_large_v0; - if( btGetCpuCapabilities() & 0x2000 ) - _mindot_large = _mindot_large_v1; - else - _mindot_large = _mindot_large_v0; - - return _mindot_large(vv, vec, count, dotResult); + return _mindot_large(vv, vec, count, dotResult); } - - #if defined __arm__ -# define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; }) +#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; }) #else //support 64bit arm -# define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; }) +#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; }) #endif - -long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult) { - unsigned long i = 0; - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x2_t vLo = vget_low_f32(vvec); - float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); - float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY }; - float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY }; - uint32x2_t indexLo = (uint32x2_t) {0, 1}; - uint32x2_t indexHi = (uint32x2_t) {2, 3}; - uint32x2_t iLo = (uint32x2_t) {static_cast(-1), static_cast(-1)}; - uint32x2_t iHi = (uint32x2_t) {static_cast(-1), static_cast(-1)}; - const uint32x2_t four = (uint32x2_t) {4,4}; - - for( ; i+8 <= count; i+= 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); 
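			// (Illustrative note, valid for the old and reformatted NEON code
			// alike: this kernel works on 64-bit lane pairs. vmul_f32 forms
			// (x * vec.x, y * vec.y) for each vertex, vpadd_f32 sums those
			// pairs two vertices at a time, and the vtrn_f32 transpose exposes
			// the z components so one more multiply and add completes two dot
			// products per register. Running maxima and their indices are then
			// updated branch-free with vcgt_f32 compares and vbsl_f32 selects.)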
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - xy0 = vmul_f32( vget_low_f32(v0), vLo); - xy1 = vmul_f32( vget_low_f32(v1), vLo); - xy2 = vmul_f32( vget_low_f32(v2), vLo); - xy3 = vmul_f32( vget_low_f32(v3), vLo); - - z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - zLo = vmul_f32( z0.val[0], vHi); - zHi = vmul_f32( z1.val[0], vHi); - - rLo = vpadd_f32( xy0, xy1); - rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - maskLo = vcgt_f32( rLo, dotMaxLo ); - maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - for( ; i+4 <= count; i+= 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - switch( count & 3 ) - { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - 
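// Arithmetic shape of the _v0 kernels being reformatted here: vLo holds
// (vx, vy) and vHi broadcasts vz, so vpadd_f32(xy0, xy1) pair-adds the x/y
// products into { x0*vx + y0*vy, x1*vx + y1*vy }, while vtrn_f32 on the high
// halves lines up (z0, z1) for a single multiply by vz; the final vadd_f32
// therefore yields two complete dot products per 64-bit register.
#if 0
// scalar equivalent for one pair of points (illustrative only):
dot0 = x0 * vx + y0 * vy + z0 * vz;
dot1 = x1 * vx + y1 * vy + z1 * vz;
#endif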
float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy2); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - } - break; - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - rLo = vadd_f32(rLo, zLo); - - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); - float32x2_t zLo = vmul_f32( z0, vHi); - float32x2_t rLo = vpadd_f32( xy0, xy0); - rLo = vadd_f32(rLo, zLo); - uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo ); - dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - - default: - break; - } - - // select best answer between hi and lo results - uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo ); - dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); - iLo = vbsl_u32(mask, iHi, iLo); - - // select best answer between even and odd results - dotMaxHi = vdup_lane_f32(dotMaxLo, 1); - iHi = vdup_lane_u32(iLo, 1); - mask = vcgt_f32( dotMaxHi, dotMaxLo ); - dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); - iLo = vbsl_u32(mask, iHi, iLo); - - *dotResult = vget_lane_f32( dotMaxLo, 0); - return vget_lane_u32(iLo, 0); -} + unsigned long i = 0; + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x2_t vLo = vget_low_f32(vvec); + float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); + float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY}; + float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY}; + uint32x2_t indexLo = (uint32x2_t){0, 1}; + uint32x2_t indexHi = (uint32x2_t){2, 3}; + uint32x2_t iLo = (uint32x2_t){static_cast(-1), static_cast(-1)}; + uint32x2_t iHi = (uint32x2_t){static_cast(-1), static_cast(-1)}; + const uint32x2_t four = (uint32x2_t){4, 4}; + + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + 
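// The compare/select block below is the running argmax: vcgt_f32 produces an
// all-ones lane mask wherever a new dot product beats the current best, and
// vbsl_f32/vbsl_u32 use that mask to splice in both the winning value and the
// lane's running index (indexLo/indexHi advance by four per group of four
// points). Scalar equivalent for one lane (illustrative only):
#if 0
if (r > dotMax) { dotMax = r; bestIndex = laneIndex; }
laneIndex += 4;  // each lane strides over the input four points at a time
#endif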
uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + xy0 = vmul_f32(vget_low_f32(v0), vLo); + xy1 = vmul_f32(vget_low_f32(v1), vLo); + xy2 = vmul_f32(vget_low_f32(v2), vLo); + xy3 = vmul_f32(vget_low_f32(v3), vLo); + + z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + zLo = vmul_f32(z0.val[0], vHi); + zHi = vmul_f32(z1.val[0], vHi); + + rLo = vpadd_f32(xy0, xy1); + rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + maskLo = vcgt_f32(rLo, dotMaxLo); + maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); -long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy2); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, 
iHi); + } + break; + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + rLo = vadd_f32(rLo, zLo); + + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); + float32x2_t zLo = vmul_f32(z0, vHi); + float32x2_t rLo = vpadd_f32(xy0, xy0); + rLo = vadd_f32(rLo, zLo); + uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo); + dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo); + dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); + iLo = vbsl_u32(mask, iHi, iLo); + + // select best answer between even and odd results + dotMaxHi = vdup_lane_f32(dotMaxLo, 1); + iHi = vdup_lane_u32(iLo, 1); + mask = vcgt_f32(dotMaxHi, dotMaxLo); + dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo); + iLo = vbsl_u32(mask, iHi, iLo); + + *dotResult = vget_lane_f32(dotMaxLo, 0); + return vget_lane_u32(iLo, 0); +} + +long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult) { - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); - float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); - const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 }; - uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; - uint32x4_t index = (uint32x4_t) { static_cast(-1), static_cast(-1), static_cast(-1), static_cast(-1) }; - float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY }; - - unsigned long i = 0; - for( ; i + 8 <= count; i += 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a 
single vswp d, d - xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - zb = vuzpq_f32( z0, z1); - z = vmulq_f32( zb.val[0], vHi); - xy = vuzpq_f32( xy0, xy1); - x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - for( ; i + 4 <= count; i += 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - switch (count & 3) { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - - xy0 = vmulq_f32(xy0, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z0); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - 
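// Data layout in the _v1 kernels: each group of four points is repacked so
// that one Q register holds the x/y products and another the z values, and
// vuzpq_f32 de-interleaves them, giving four dot products per vaddq_f32:
//   dot[k] = x[k]*vx + y[k]*vy + z[k]*vz,  k = 0..3
// The recurring "single vswp d, d" comments record a codegen expectation: on
// 32-bit ARM the vcombine/vget pairs shuffle halves of the same Q registers,
// so the compiler should fold each pair into one VSWP instruction.
#if 0
// reference scalar check for one group of four (illustrative only):
for (int k = 0; k < 4; ++k)
	dot[k] = p[k].x * vx + p[k].y * vy + p[k].z * vz;
#endif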
uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); - - xy0 = vmulq_f32(xy0, vLo); - - z = vmulq_f32( z, vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcgtq_f32(x, maxDot); - maxDot = vbslq_f32( mask, x, maxDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - default: - break; - } - - - // select best answer between hi and lo results - uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot)); - float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot)); - uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); - - // select best answer between even and odd results - float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1); - uint32x2_t indexHi = vdup_lane_u32(index2, 1); - mask = vcgt_f32( maxDotO, maxDot2 ); - maxDot2 = vbsl_f32(mask, maxDotO, maxDot2); - index2 = vbsl_u32(mask, indexHi, index2); - - *dotResult = vget_lane_f32( maxDot2, 0); - return vget_lane_u32(index2, 0); - + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); + float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); + const uint32x4_t four = (uint32x4_t){4, 4, 4, 4}; + uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3}; + uint32x4_t index = (uint32x4_t){static_cast(-1), static_cast(-1), static_cast(-1), static_cast(-1)}; + float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY}; + + unsigned long i = 0; + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // 
the next two lines should resolve to a single vswp d, d + z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + zb = vuzpq_f32(z0, z1); + z = vmulq_f32(zb.val[0], vHi); + xy = vuzpq_f32(xy0, xy1); + x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + + xy0 = vmulq_f32(xy0, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z0); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + 
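// Tail handling: the unrolled loops above consume points eight (then four) at
// a time, and this switch mops up the remaining `count & 3` entries. The odd
// cases re-broadcast the last point (e.g. vget_low_f32(v2) twice in case 3)
// so the same 4-wide compare path can be reused; because every compare in the
// kernel is strict (vcgtq_f32), a duplicated lane can only tie the real one,
// and its padded index never survives the final reduction. Scalar shape of
// the same idea (illustrative; considerCandidate is a hypothetical helper):
#if 0
unsigned long tail = count & 3;  // 0..3 points left over
for (unsigned long k = count - tail; k < count; ++k)
	considerCandidate(vv + 4 * k);
#endif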
case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); + + xy0 = vmulq_f32(xy0, vLo); + + z = vmulq_f32(z, vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcgtq_f32(x, maxDot); + maxDot = vbslq_f32(mask, x, maxDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot)); + float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot)); + uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); + + // select best answer between even and odd results + float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1); + uint32x2_t indexHi = vdup_lane_u32(index2, 1); + mask = vcgt_f32(maxDotO, maxDot2); + maxDot2 = vbsl_f32(mask, maxDotO, maxDot2); + index2 = vbsl_u32(mask, indexHi, index2); + + *dotResult = vget_lane_f32(maxDot2, 0); + return vget_lane_u32(index2, 0); } -long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult) { - unsigned long i = 0; - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x2_t vLo = vget_low_f32(vvec); - float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); - float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY }; - float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY }; - uint32x2_t indexLo = (uint32x2_t) {0, 1}; - uint32x2_t indexHi = (uint32x2_t) {2, 3}; - uint32x2_t iLo = (uint32x2_t) {static_cast(-1), static_cast(-1)}; - uint32x2_t iHi = (uint32x2_t) {static_cast(-1), static_cast(-1)}; - const uint32x2_t four = (uint32x2_t) {4,4}; - - for( ; i+8 <= count; i+= 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - 
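// The _mindot_large kernels below are line-for-line mirrors of the
// _maxdot_large ones; only three things flip: the running best starts at
// +BT_INFINITY instead of -BT_INFINITY, every vcgt/vcgtq compare becomes
// vclt/vcltq, and the final lane reductions likewise select with less-than.
#if 0
// scalar equivalent of the per-candidate update (illustrative only):
if (r < dotMin) { dotMin = r; bestIndex = laneIndex; }
#endif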
xy0 = vmul_f32( vget_low_f32(v0), vLo); - xy1 = vmul_f32( vget_low_f32(v1), vLo); - xy2 = vmul_f32( vget_low_f32(v2), vLo); - xy3 = vmul_f32( vget_low_f32(v3), vLo); - - z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - zLo = vmul_f32( z0.val[0], vHi); - zHi = vmul_f32( z1.val[0], vHi); - - rLo = vpadd_f32( xy0, xy1); - rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - maskLo = vclt_f32( rLo, dotMinLo ); - maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - - for( ; i+4 <= count; i+= 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( z1.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy3); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - indexLo = vadd_u32(indexLo, four); - indexHi = vadd_u32(indexHi, four); - } - switch( count & 3 ) - { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - float32x2_t rHi = vpadd_f32( xy2, xy2); - rLo = vadd_f32(rLo, zLo); - rHi = vadd_f32(rHi, zHi); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - uint32x2_t maskHi = vclt_f32( rHi, dotMinHi ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi); - iLo = vbsl_u32(maskLo, indexLo, iLo); - iHi = vbsl_u32(maskHi, indexHi, iHi); - } - break; - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo); - - float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x2_t zLo = vmul_f32( z0.val[0], vHi); - - float32x2_t rLo = vpadd_f32( xy0, xy1); - rLo = vadd_f32(rLo, zLo); - - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - iLo = 
vbsl_u32(maskLo, indexLo, iLo); - } - break; - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo); - float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); - float32x2_t zLo = vmul_f32( z0, vHi); - float32x2_t rLo = vpadd_f32( xy0, xy0); - rLo = vadd_f32(rLo, zLo); - uint32x2_t maskLo = vclt_f32( rLo, dotMinLo ); - dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo); - iLo = vbsl_u32(maskLo, indexLo, iLo); - } - break; - - default: - break; - } - - // select best answer between hi and lo results - uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo ); - dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); - iLo = vbsl_u32(mask, iHi, iLo); - - // select best answer between even and odd results - dotMinHi = vdup_lane_f32(dotMinLo, 1); - iHi = vdup_lane_u32(iLo, 1); - mask = vclt_f32( dotMinHi, dotMinLo ); - dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); - iLo = vbsl_u32(mask, iHi, iLo); - - *dotResult = vget_lane_f32( dotMinLo, 0); - return vget_lane_u32(iLo, 0); + unsigned long i = 0; + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x2_t vLo = vget_low_f32(vvec); + float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0); + float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY}; + float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY}; + uint32x2_t indexLo = (uint32x2_t){0, 1}; + uint32x2_t indexHi = (uint32x2_t){2, 3}; + uint32x2_t iLo = (uint32x2_t){static_cast(-1), static_cast(-1)}; + uint32x2_t iHi = (uint32x2_t){static_cast(-1), static_cast(-1)}; + const uint32x2_t four = (uint32x2_t){4, 4}; + + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + xy0 = vmul_f32(vget_low_f32(v0), vLo); + xy1 = vmul_f32(vget_low_f32(v1), vLo); + xy2 = vmul_f32(vget_low_f32(v2), vLo); + xy3 = vmul_f32(vget_low_f32(v3), vLo); + + z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + zLo = vmul_f32(z0.val[0], vHi); + zHi = vmul_f32(z1.val[0], vHi); + + rLo = vpadd_f32(xy0, xy1); + rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + maskLo = vclt_f32(rLo, dotMinLo); + maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, 
dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(z1.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy3); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + indexLo = vadd_u32(indexLo, four); + indexHi = vadd_u32(indexHi, four); + } + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + float32x2_t rHi = vpadd_f32(xy2, xy2); + rLo = vadd_f32(rLo, zLo); + rHi = vadd_f32(rHi, zHi); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + uint32x2_t maskHi = vclt_f32(rHi, dotMinHi); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi); + iLo = vbsl_u32(maskLo, indexLo, iLo); + iHi = vbsl_u32(maskHi, indexHi, iHi); + } + break; + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo); + + float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x2_t zLo = vmul_f32(z0.val[0], vHi); + + float32x2_t rLo = vpadd_f32(xy0, xy1); + rLo = vadd_f32(rLo, zLo); + + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo); + float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0); + float32x2_t zLo = vmul_f32(z0, vHi); + float32x2_t rLo = vpadd_f32(xy0, xy0); + rLo = vadd_f32(rLo, zLo); + uint32x2_t maskLo = vclt_f32(rLo, dotMinLo); + dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo); + iLo = vbsl_u32(maskLo, indexLo, iLo); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo); + dotMinLo = vbsl_f32(mask, 
dotMinHi, dotMinLo); + iLo = vbsl_u32(mask, iHi, iLo); + + // select best answer between even and odd results + dotMinHi = vdup_lane_f32(dotMinLo, 1); + iHi = vdup_lane_u32(iLo, 1); + mask = vclt_f32(dotMinHi, dotMinLo); + dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo); + iLo = vbsl_u32(mask, iHi, iLo); + + *dotResult = vget_lane_f32(dotMinLo, 0); + return vget_lane_u32(iLo, 0); } -long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult ) +long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult) { - float32x4_t vvec = vld1q_f32_aligned_postincrement( vec ); - float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); - float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); - const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 }; - uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3}; - uint32x4_t index = (uint32x4_t) { static_cast(-1), static_cast(-1), static_cast(-1), static_cast(-1) }; - float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY }; - - unsigned long i = 0; - for( ; i + 8 <= count; i += 8 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - - v0 = vld1q_f32_aligned_postincrement( vv ); - v1 = vld1q_f32_aligned_postincrement( vv ); - v2 = vld1q_f32_aligned_postincrement( vv ); - v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - zb = vuzpq_f32( z0, z1); - z = vmulq_f32( zb.val[0], vHi); - xy = vuzpq_f32( xy0, xy1); - x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - for( ; i + 4 <= count; i += 4 ) - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v3 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), 
vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - - switch (count & 3) { - case 3: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v2 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2)); - - xy0 = vmulq_f32(xy0, vLo); - xy1 = vmulq_f32(xy1, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z1); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy1); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 2: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - float32x4_t v1 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1)); - - xy0 = vmulq_f32(xy0, vLo); - - float32x4x2_t zb = vuzpq_f32( z0, z0); - float32x4_t z = vmulq_f32( zb.val[0], vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - case 1: - { - float32x4_t v0 = vld1q_f32_aligned_postincrement( vv ); - - // the next two lines should resolve to a single vswp d, d - float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0)); - // the next two lines should resolve to a single vswp d, d - float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); - - xy0 = vmulq_f32(xy0, vLo); - - z = vmulq_f32( z, vHi); - float32x4x2_t xy = vuzpq_f32( xy0, xy0); - float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); - x = vaddq_f32(x, z); - - uint32x4_t mask = vcltq_f32(x, minDot); - minDot = vbslq_f32( mask, x, minDot); - index = vbslq_u32(mask, local_index, index); - local_index = vaddq_u32(local_index, four); - } - break; - - default: - break; - } - - - // select best answer between hi and lo results - uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot)); - float32x2_t minDot2 = vbsl_f32(mask, 
vget_high_f32(minDot), vget_low_f32(minDot)); - uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); - - // select best answer between even and odd results - float32x2_t minDotO = vdup_lane_f32(minDot2, 1); - uint32x2_t indexHi = vdup_lane_u32(index2, 1); - mask = vclt_f32( minDotO, minDot2 ); - minDot2 = vbsl_f32(mask, minDotO, minDot2); - index2 = vbsl_u32(mask, indexHi, index2); - - *dotResult = vget_lane_f32( minDot2, 0); - return vget_lane_u32(index2, 0); - + float32x4_t vvec = vld1q_f32_aligned_postincrement(vec); + float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec)); + float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0); + const uint32x4_t four = (uint32x4_t){4, 4, 4, 4}; + uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3}; + uint32x4_t index = (uint32x4_t){static_cast(-1), static_cast(-1), static_cast(-1), static_cast(-1)}; + float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY}; + + unsigned long i = 0; + for (; i + 8 <= count; i += 8) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + + v0 = vld1q_f32_aligned_postincrement(vv); + v1 = vld1q_f32_aligned_postincrement(vv); + v2 = vld1q_f32_aligned_postincrement(vv); + v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp d, d + z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + zb = vuzpq_f32(z0, z1); + z = vmulq_f32(zb.val[0], vHi); + xy = vuzpq_f32(xy0, xy1); + x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + for (; i + 4 <= count; i += 4) + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v3 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3)); + // the next two lines should resolve to a single vswp 
d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + + switch (count & 3) + { + case 3: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v2 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2)); + + xy0 = vmulq_f32(xy0, vLo); + xy1 = vmulq_f32(xy1, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z1); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy1); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 2: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + float32x4_t v1 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1)); + + xy0 = vmulq_f32(xy0, vLo); + + float32x4x2_t zb = vuzpq_f32(z0, z0); + float32x4_t z = vmulq_f32(zb.val[0], vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + case 1: + { + float32x4_t v0 = vld1q_f32_aligned_postincrement(vv); + + // the next two lines should resolve to a single vswp d, d + float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0)); + // the next two lines should resolve to a single vswp d, d + float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); + + xy0 = vmulq_f32(xy0, vLo); + + z = vmulq_f32(z, vHi); + float32x4x2_t xy = vuzpq_f32(xy0, xy0); + float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]); + x = vaddq_f32(x, z); + + uint32x4_t mask = vcltq_f32(x, minDot); + minDot = vbslq_f32(mask, x, minDot); + index = vbslq_u32(mask, local_index, index); + local_index = vaddq_u32(local_index, four); + } + break; + + default: + break; + } + + // select best answer between hi and lo results + uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot)); + float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot)); + uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index)); + + // select best answer between even and odd results + float32x2_t minDotO 
= vdup_lane_f32(minDot2, 1); + uint32x2_t indexHi = vdup_lane_u32(index2, 1); + mask = vclt_f32(minDotO, minDot2); + minDot2 = vbsl_f32(mask, minDotO, minDot2); + index2 = vbsl_u32(mask, indexHi, index2); + + *dotResult = vget_lane_f32(minDot2, 0); + return vget_lane_u32(index2, 0); } #else - #error Unhandled __APPLE__ arch +#error Unhandled __APPLE__ arch #endif -#endif /* __APPLE__ */ - - +#endif /* __APPLE__ */ diff --git a/thirdparty/bullet/LinearMath/btVector3.h b/thirdparty/bullet/LinearMath/btVector3.h index 76024f1236..61fd8d1e46 100644 --- a/thirdparty/bullet/LinearMath/btVector3.h +++ b/thirdparty/bullet/LinearMath/btVector3.h @@ -12,8 +12,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef BT_VECTOR3_H #define BT_VECTOR3_H @@ -28,25 +26,24 @@ subject to the following restrictions: #else #define btVector3Data btVector3FloatData #define btVector3DataName "btVector3FloatData" -#endif //BT_USE_DOUBLE_PRECISION +#endif //BT_USE_DOUBLE_PRECISION #if defined BT_USE_SSE //typedef uint32_t __m128i __attribute__ ((vector_size(16))); #ifdef _MSC_VER -#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255' +#pragma warning(disable : 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255' #endif - -#define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x)) +#define BT_SHUFFLE(x, y, z, w) ((w) << 6 | (z) << 4 | (y) << 2 | (x)) //#define bt_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) ) -#define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) ) -#define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) ) -#define bt_splat_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) ) +#define bt_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask)) +#define bt_splat3_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, 3)) +#define bt_splat_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, _i)) #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) -#define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) +#define btvAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)) #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)) #define btv3AbsfMask btCastiTo128f(btv3AbsiMask) #define btvFFF0fMask btCastiTo128f(btvFFF0Mask) @@ -55,9 +52,9 @@ subject to the following restrictions: //there is an issue with XCode 3.2 (LCx errors) #define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)) -#define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f)) -#define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f)) -#define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f)) +#define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f)) +#define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f)) +#define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f)) //const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f}; //const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f}; @@ -70,7 +67,7 @@ subject to the following restrictions: const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f}; const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast(0xFFFFFFFF), - static_cast(0xFFFFFFFF), static_cast(0xFFFFFFFF), 0x0}; + static_cast(0xFFFFFFFF), static_cast(0xFFFFFFFF), 0x0}; const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 
0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0}; @@ -80,50 +77,48 @@ const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FF * It has an un-used w component to suit 16-byte alignment when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers */ -ATTRIBUTE_ALIGNED16(class) btVector3 +ATTRIBUTE_ALIGNED16(class) +btVector3 { public: - BT_DECLARE_ALIGNED_ALLOCATOR(); -#if defined (__SPU__) && defined (__CELLOS_LV2__) - btScalar m_floats[4]; +#if defined(__SPU__) && defined(__CELLOS_LV2__) + btScalar m_floats[4]; + public: - SIMD_FORCE_INLINE const vec_float4& get128() const + SIMD_FORCE_INLINE const vec_float4& get128() const { return *((const vec_float4*)&m_floats[0]); } + public: -#else //__CELLOS_LV2__ __SPU__ - #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM - union { - btSimdFloat4 mVec128; - btScalar m_floats[4]; - }; - SIMD_FORCE_INLINE btSimdFloat4 get128() const - { - return mVec128; - } - SIMD_FORCE_INLINE void set128(btSimdFloat4 v128) - { - mVec128 = v128; - } - #else - btScalar m_floats[4]; - #endif -#endif //__CELLOS_LV2__ __SPU__ - - public: - - /**@brief No initialization constructor */ - SIMD_FORCE_INLINE btVector3() +#else //__CELLOS_LV2__ __SPU__ +#if defined(BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM + union { + btSimdFloat4 mVec128; + btScalar m_floats[4]; + }; + SIMD_FORCE_INLINE btSimdFloat4 get128() const + { + return mVec128; + } + SIMD_FORCE_INLINE void set128(btSimdFloat4 v128) { + mVec128 = v128; + } +#else + btScalar m_floats[4]; +#endif +#endif //__CELLOS_LV2__ __SPU__ +public: + /**@brief No initialization constructor */ + SIMD_FORCE_INLINE btVector3() + { } - - - /**@brief Constructor from scalars + /**@brief Constructor from scalars * @param x X value * @param y Y value * @param z Z value @@ -136,9 +131,9 @@ public: m_floats[3] = btScalar(0.f); } -#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON) - // Set Vector - SIMD_FORCE_INLINE btVector3( btSimdFloat4 v) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) + // Set Vector + SIMD_FORCE_INLINE btVector3(btSimdFloat4 v) { mVec128 = v; } @@ -150,73 +145,72 @@ public: } // Assignment Operator - SIMD_FORCE_INLINE btVector3& - operator=(const btVector3& v) + SIMD_FORCE_INLINE btVector3& + operator=(const btVector3& v) { mVec128 = v.mVec128; - + return *this; } -#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - -/**@brief Add a vector to this one +#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) + + /**@brief Add a vector to this one * @param The vector to add to this one */ SIMD_FORCE_INLINE btVector3& operator+=(const btVector3& v) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = _mm_add_ps(mVec128, v.mVec128); #elif defined(BT_USE_NEON) mVec128 = vaddq_f32(mVec128, v.mVec128); #else - m_floats[0] += v.m_floats[0]; + m_floats[0] += v.m_floats[0]; m_floats[1] += v.m_floats[1]; m_floats[2] += v.m_floats[2]; #endif return *this; } - - /**@brief Subtract a vector from this one + /**@brief Subtract a vector from this one * @param The vector to subtract */ - SIMD_FORCE_INLINE btVector3& operator-=(const btVector3& v) + SIMD_FORCE_INLINE 
btVector3& operator-=(const btVector3& v) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = _mm_sub_ps(mVec128, v.mVec128); #elif defined(BT_USE_NEON) mVec128 = vsubq_f32(mVec128, v.mVec128); #else - m_floats[0] -= v.m_floats[0]; + m_floats[0] -= v.m_floats[0]; m_floats[1] -= v.m_floats[1]; m_floats[2] -= v.m_floats[2]; #endif return *this; } - - /**@brief Scale the vector + + /**@brief Scale the vector * @param s Scale factor */ SIMD_FORCE_INLINE btVector3& operator*=(const btScalar& s) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = bt_pshufd_ps(vs, 0x80); // (S S S 0.0) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = bt_pshufd_ps(vs, 0x80); // (S S S 0.0) mVec128 = _mm_mul_ps(mVec128, vs); #elif defined(BT_USE_NEON) mVec128 = vmulq_n_f32(mVec128, s); #else - m_floats[0] *= s; + m_floats[0] *= s; m_floats[1] *= s; m_floats[2] *= s; #endif return *this; } - /**@brief Inversely scale the vector + /**@brief Inversely scale the vector * @param s Scale factor to divide by */ - SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s) + SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s) { btFullAssert(s != btScalar(0.0)); -#if 0 //defined(BT_USE_SSE_IN_API) +#if 0 //defined(BT_USE_SSE_IN_API) // this code is not faster ! __m128 vs = _mm_load_ss(&s); vs = _mm_div_ss(v1110, vs); @@ -230,11 +224,11 @@ public: #endif } - /**@brief Return the dot product + /**@brief Return the dot product * @param v The other vector in the dot product */ SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) __m128 vd = _mm_mul_ps(mVec128, v.mVec128); __m128 z = _mm_movehl_ps(vd, vd); __m128 y = _mm_shuffle_ps(vd, vd, 0x55); @@ -243,23 +237,23 @@ public: return _mm_cvtss_f32(vd); #elif defined(BT_USE_NEON) float32x4_t vd = vmulq_f32(mVec128, v.mVec128); - float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd)); + float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd)); x = vadd_f32(x, vget_high_f32(vd)); return vget_lane_f32(x, 0); -#else - return m_floats[0] * v.m_floats[0] + - m_floats[1] * v.m_floats[1] + - m_floats[2] * v.m_floats[2]; +#else + return m_floats[0] * v.m_floats[0] + + m_floats[1] * v.m_floats[1] + + m_floats[2] * v.m_floats[2]; #endif } - /**@brief Return the length of the vector squared */ + /**@brief Return the length of the vector squared */ SIMD_FORCE_INLINE btScalar length2() const { return dot(*this); } - /**@brief Return the length of the vector */ + /**@brief Return the length of the vector */ SIMD_FORCE_INLINE btScalar length() const { return btSqrt(length2()); @@ -267,7 +261,7 @@ public: /**@brief Return the norm (length) of the vector */ SIMD_FORCE_INLINE btScalar norm() const - { + { return length(); } @@ -276,24 +270,24 @@ public: { btScalar d = length2(); //workaround for some clang/gcc issue of sqrtf(tiny number) = -INF - if (d>SIMD_EPSILON) + if (d > SIMD_EPSILON) return btSqrt(d); return btScalar(0); } - /**@brief Return the distance squared between the ends of this and another vector + /**@brief Return the distance squared between the ends of this and another vector * This is symantically treating the vector like a point */ SIMD_FORCE_INLINE btScalar distance2(const btVector3& v) const; - 
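The SSE dot() above avoids a full horizontal add: _mm_shuffle_ps with immediate 0x55 broadcasts lane 1 (y) and _mm_movehl_ps drops the upper half down so lane 0 holds z, after which two _mm_add_ss instructions sum x + y + z in the low lane. A self-contained sketch of the same idiom (the function name is mine):

#include <xmmintrin.h>

// Horizontal sum of the first three lanes of a*b; lane 3 never enters the sum.
static float hsum3_mul(__m128 a, __m128 b)
{
	__m128 vd = _mm_mul_ps(a, b);             // (x0*x1, y0*y1, z0*z1, w0*w1)
	__m128 z = _mm_movehl_ps(vd, vd);         // lane 0 now holds z0*z1
	__m128 y = _mm_shuffle_ps(vd, vd, 0x55);  // lane 0 now holds y0*y1
	vd = _mm_add_ss(vd, y);                   // x + y
	vd = _mm_add_ss(vd, z);                   // x + y + z
	return _mm_cvtss_f32(vd);
}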
/**@brief Return the distance between the ends of this and another vector + /**@brief Return the distance between the ends of this and another vector * This is symantically treating the vector like a point */ SIMD_FORCE_INLINE btScalar distance(const btVector3& v) const; - SIMD_FORCE_INLINE btVector3& safeNormalize() + SIMD_FORCE_INLINE btVector3& safeNormalize() { btScalar l2 = length2(); //triNormal.normalize(); - if (l2 >= SIMD_EPSILON*SIMD_EPSILON) + if (l2 >= SIMD_EPSILON * SIMD_EPSILON) { (*this) /= btSqrt(l2); } @@ -304,100 +298,97 @@ public: return *this; } - /**@brief Normalize this vector + /**@brief Normalize this vector * x^2 + y^2 + z^2 = 1 */ - SIMD_FORCE_INLINE btVector3& normalize() + SIMD_FORCE_INLINE btVector3& normalize() { - btAssert(!fuzzyZero()); -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - // dot product first +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + // dot product first __m128 vd = _mm_mul_ps(mVec128, mVec128); __m128 z = _mm_movehl_ps(vd, vd); __m128 y = _mm_shuffle_ps(vd, vd, 0x55); vd = _mm_add_ss(vd, y); vd = _mm_add_ss(vd, z); - - #if 0 + +#if 0 vd = _mm_sqrt_ss(vd); vd = _mm_div_ss(v1110, vd); vd = bt_splat_ps(vd, 0x80); mVec128 = _mm_mul_ps(mVec128, vd); - #else - - // NR step 1/sqrt(x) - vd is x, y is output - y = _mm_rsqrt_ss(vd); // estimate - - // one step NR - z = v1_5; - vd = _mm_mul_ss(vd, vHalf); // vd * 0.5 - //x2 = vd; - vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 - vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0 - z = _mm_sub_ss(z, vd); // 1.5 - vd * 0.5 * y0 * y0 - - y = _mm_mul_ss(y, z); // y0 * (1.5 - vd * 0.5 * y0 * y0) +#else + + // NR step 1/sqrt(x) - vd is x, y is output + y = _mm_rsqrt_ss(vd); // estimate + + // one step NR + z = v1_5; + vd = _mm_mul_ss(vd, vHalf); // vd * 0.5 + //x2 = vd; + vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 + vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0 + z = _mm_sub_ss(z, vd); // 1.5 - vd * 0.5 * y0 * y0 + + y = _mm_mul_ss(y, z); // y0 * (1.5 - vd * 0.5 * y0 * y0) y = bt_splat_ps(y, 0x80); mVec128 = _mm_mul_ps(mVec128, y); - #endif +#endif - return *this; -#else +#else return *this /= length(); #endif } - /**@brief Return a normalized version of this vector */ + /**@brief Return a normalized version of this vector */ SIMD_FORCE_INLINE btVector3 normalized() const; - /**@brief Return a rotated version of this vector + /**@brief Return a rotated version of this vector * @param wAxis The axis to rotate about * @param angle The angle to rotate by */ - SIMD_FORCE_INLINE btVector3 rotate( const btVector3& wAxis, const btScalar angle ) const; + SIMD_FORCE_INLINE btVector3 rotate(const btVector3& wAxis, const btScalar angle) const; - /**@brief Return the angle between this and another vector + /**@brief Return the angle between this and another vector * @param v The other vector */ - SIMD_FORCE_INLINE btScalar angle(const btVector3& v) const + SIMD_FORCE_INLINE btScalar angle(const btVector3& v) const { btScalar s = btSqrt(length2() * v.length2()); btFullAssert(s != btScalar(0.0)); return btAcos(dot(v) / s); } - - /**@brief Return a vector with the absolute values of each element */ - SIMD_FORCE_INLINE btVector3 absolute() const - { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) + /**@brief Return a vector with the absolute values of each element */ + SIMD_FORCE_INLINE btVector3 absolute() const + { +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) return btVector3(_mm_and_ps(mVec128, btv3AbsfMask)); #elif 
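Note that the reformatted normalize() also drops the old btAssert(!fuzzyZero()) guard. Its SSE path refines the roughly 12-bit _mm_rsqrt_ss estimate with one Newton-Raphson step, y1 = y0 * (1.5 - 0.5 * x * y0 * y0), which about doubles the number of correct bits; the interleaved _mm_mul_ss/_mm_sub_ss sequence above is exactly that recurrence. A scalar sketch (function name is mine):

// One Newton-Raphson step for f(y) = 1/(y*y) - x, whose positive root is 1/sqrt(x).
static float rsqrt_nr_step(float x, float y0)
{
	return y0 * (1.5f - 0.5f * x * y0 * y0);
}

// Example: x = 4 with crude seed y0 = 0.4 converges quadratically toward 0.5:
// 0.4 -> 0.472 -> 0.49769 -> 0.49999...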
defined(BT_USE_NEON) return btVector3(vabsq_f32(mVec128)); -#else +#else return btVector3( - btFabs(m_floats[0]), - btFabs(m_floats[1]), + btFabs(m_floats[0]), + btFabs(m_floats[1]), btFabs(m_floats[2])); #endif } - - /**@brief Return the cross product between this and another vector + + /**@brief Return the cross product between this and another vector * @param v The other vector */ SIMD_FORCE_INLINE btVector3 cross(const btVector3& v) const { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 T, V; - - T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 T, V; + + T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + V = _mm_mul_ps(V, mVec128); T = _mm_mul_ps(T, v.mVec128); V = _mm_sub_ps(V, T); - + V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3)); return btVector3(V); #elif defined(BT_USE_NEON) @@ -407,7 +398,7 @@ public: float32x2_t Vlow = vget_low_f32(v.mVec128); T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow); V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow); - + V = vmulq_f32(V, mVec128); T = vmulq_f32(T, v.mVec128); V = vsubq_f32(V, T); @@ -415,7 +406,7 @@ public: // form (Y, Z, X, _); V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow); V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask); - + return btVector3(V); #else return btVector3( @@ -427,18 +418,18 @@ public: SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) // cross: - __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) - + __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0) + V = _mm_mul_ps(V, v1.mVec128); T = _mm_mul_ps(T, v2.mVec128); V = _mm_sub_ps(V, T); - + V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3)); - // dot: + // dot: V = _mm_mul_ps(V, mVec128); __m128 z = _mm_movehl_ps(V, V); __m128 y = _mm_shuffle_ps(V, V, 0x55); @@ -454,7 +445,7 @@ public: float32x2_t Vlow = vget_low_f32(v2.mVec128); T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow); V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow); - + V = vmulq_f32(V, v1.mVec128); T = vmulq_f32(T, v2.mVec128); V = vsubq_f32(V, T); @@ -462,31 +453,30 @@ public: // form (Y, Z, X, _); V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow); - // dot: + // dot: V = vmulq_f32(mVec128, V); - float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V)); + float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V)); x = vadd_f32(x, vget_high_f32(V)); return vget_lane_f32(x, 0); #else - return - m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + - m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + - m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]); + return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + + m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * 
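cross() and triple() above share the same two-shuffle trick: with yzx denoting the lane rotation (x, y, z) -> (y, z, x), they compute a * yzx(b) - yzx(a) * b, whose lanes hold the cross product in rotated order, and one final yzx shuffle puts the components in place. A scalar transcription of that identity (function name is mine):

// cross(a, b) == yzx( a * yzx(b) - yzx(a) * b ), matching the SSE/NEON paths.
static void cross_via_rotation(const float a[3], const float b[3], float out[3])
{
	float t[3];  // a * yzx(b) - yzx(a) * b, still in rotated order
	t[0] = a[0] * b[1] - a[1] * b[0];  // becomes the z component
	t[1] = a[1] * b[2] - a[2] * b[1];  // becomes the x component
	t[2] = a[2] * b[0] - a[0] * b[2];  // becomes the y component
	out[0] = t[1];
	out[1] = t[2];
	out[2] = t[0];
}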
v2.m_floats[2]) + + m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]); #endif } - /**@brief Return the axis with the smallest value + /**@brief Return the axis with the smallest value * Note return values are 0,1,2 for x, y, or z */ SIMD_FORCE_INLINE int minAxis() const { - return m_floats[0] < m_floats[1] ? (m_floats[0] return this, t=1 => return other) */ - SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const + SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - __m128 vt = _mm_load_ss(&t); // (t 0 0 0) - vt = bt_pshufd_ps(vt, 0x80); // (rt rt rt 0.0) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vt = _mm_load_ss(&t); // (t 0 0 0) + vt = bt_pshufd_ps(vt, 0x80); // (rt rt rt 0.0) __m128 vl = _mm_sub_ps(v.mVec128, mVec128); vl = _mm_mul_ps(vl, vt); vl = _mm_add_ps(vl, mVec128); - + return btVector3(vl); #elif defined(BT_USE_NEON) float32x4_t vl = vsubq_f32(v.mVec128, mVec128); vl = vmulq_n_f32(vl, t); vl = vaddq_f32(vl, mVec128); - + return btVector3(vl); -#else - return - btVector3( m_floats[0] + (v.m_floats[0] - m_floats[0]) * t, - m_floats[1] + (v.m_floats[1] - m_floats[1]) * t, - m_floats[2] + (v.m_floats[2] - m_floats[2]) * t); +#else + return btVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t, + m_floats[1] + (v.m_floats[1] - m_floats[1]) * t, + m_floats[2] + (v.m_floats[2] - m_floats[2]) * t); #endif } - /**@brief Elementwise multiply this vector by the other + /**@brief Elementwise multiply this vector by the other * @param v The other vector */ SIMD_FORCE_INLINE btVector3& operator*=(const btVector3& v) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = _mm_mul_ps(mVec128, v.mVec128); #elif defined(BT_USE_NEON) mVec128 = vmulq_f32(mVec128, v.mVec128); -#else - m_floats[0] *= v.m_floats[0]; +#else + m_floats[0] *= v.m_floats[0]; m_floats[1] *= v.m_floats[1]; m_floats[2] *= v.m_floats[2]; #endif return *this; } - /**@brief Return the x value */ - SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; } - /**@brief Return the y value */ - SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; } - /**@brief Return the z value */ - SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; } - /**@brief Set the x value */ - SIMD_FORCE_INLINE void setX(btScalar _x) { m_floats[0] = _x;}; - /**@brief Set the y value */ - SIMD_FORCE_INLINE void setY(btScalar _y) { m_floats[1] = _y;}; - /**@brief Set the z value */ - SIMD_FORCE_INLINE void setZ(btScalar _z) { m_floats[2] = _z;}; - /**@brief Set the w value */ - SIMD_FORCE_INLINE void setW(btScalar _w) { m_floats[3] = _w;}; - /**@brief Return the x value */ - SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; } - /**@brief Return the y value */ - SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; } - /**@brief Return the z value */ - SIMD_FORCE_INLINE const btScalar& z() const { return m_floats[2]; } - /**@brief Return the w value */ - SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; } - - //SIMD_FORCE_INLINE btScalar& operator[](int i) { return (&m_floats[0])[i]; } + /**@brief Return the x value */ + SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; } + /**@brief Return the y value */ + SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; } + /**@brief Return the z value */ + 
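The SIMD lerp() above costs one subtract, one broadcast multiply, and one add regardless of lane count, since a + (b - a) * t needs no per-component branching. A small usage sketch with illustrative values:

btVector3 a(0.0f, 0.0f, 0.0f);
btVector3 b(10.0f, 4.0f, 0.0f);
btVector3 quarter = a.lerp(b, btScalar(0.25));  // (2.5, 1.0, 0.0)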
SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; } + /**@brief Set the x value */ + SIMD_FORCE_INLINE void setX(btScalar _x) { m_floats[0] = _x; }; + /**@brief Set the y value */ + SIMD_FORCE_INLINE void setY(btScalar _y) { m_floats[1] = _y; }; + /**@brief Set the z value */ + SIMD_FORCE_INLINE void setZ(btScalar _z) { m_floats[2] = _z; }; + /**@brief Set the w value */ + SIMD_FORCE_INLINE void setW(btScalar _w) { m_floats[3] = _w; }; + /**@brief Return the x value */ + SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; } + /**@brief Return the y value */ + SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; } + /**@brief Return the z value */ + SIMD_FORCE_INLINE const btScalar& z() const { return m_floats[2]; } + /**@brief Return the w value */ + SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; } + + //SIMD_FORCE_INLINE btScalar& operator[](int i) { return (&m_floats[0])[i]; } //SIMD_FORCE_INLINE const btScalar& operator[](int i) const { return (&m_floats[0])[i]; } ///operator btScalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons. - SIMD_FORCE_INLINE operator btScalar *() { return &m_floats[0]; } - SIMD_FORCE_INLINE operator const btScalar *() const { return &m_floats[0]; } + SIMD_FORCE_INLINE operator btScalar*() { return &m_floats[0]; } + SIMD_FORCE_INLINE operator const btScalar*() const { return &m_floats[0]; } - SIMD_FORCE_INLINE bool operator==(const btVector3& other) const + SIMD_FORCE_INLINE bool operator==(const btVector3& other) const { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); -#else - return ((m_floats[3]==other.m_floats[3]) && - (m_floats[2]==other.m_floats[2]) && - (m_floats[1]==other.m_floats[1]) && - (m_floats[0]==other.m_floats[0])); +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128))); +#else + return ((m_floats[3] == other.m_floats[3]) && + (m_floats[2] == other.m_floats[2]) && + (m_floats[1] == other.m_floats[1]) && + (m_floats[0] == other.m_floats[0])); #endif } - SIMD_FORCE_INLINE bool operator!=(const btVector3& other) const + SIMD_FORCE_INLINE bool operator!=(const btVector3& other) const { return !(*this == other); } - /**@brief Set each element to the max of the current values and the values of another btVector3 + /**@brief Set each element to the max of the current values and the values of another btVector3 * @param other The other btVector3 to compare with */ - SIMD_FORCE_INLINE void setMax(const btVector3& other) + SIMD_FORCE_INLINE void setMax(const btVector3& other) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = _mm_max_ps(mVec128, other.mVec128); #elif defined(BT_USE_NEON) mVec128 = vmaxq_f32(mVec128, other.mVec128); @@ -632,12 +620,12 @@ public: #endif } - /**@brief Set each element to the min of the current values and the values of another btVector3 + /**@brief Set each element to the min of the current values and the values of another btVector3 * @param other The other btVector3 to compare with */ - SIMD_FORCE_INLINE void setMin(const btVector3& other) + SIMD_FORCE_INLINE void setMin(const btVector3& other) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = _mm_min_ps(mVec128, other.mVec128); #elif 
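operator== above compares all four lanes in two instructions: _mm_cmpeq_ps yields an all-ones or all-zeros mask per lane, and _mm_movemask_ps packs the four lane sign bits into an int, so 0xf means every component, w included, matched (the scalar fallback compares w as well). The idiom in isolation (function name is mine):

#include <xmmintrin.h>

static bool all_lanes_equal(__m128 a, __m128 b)
{
	// Sign bit of each comparison-mask lane maps to bits 0..3 of the result.
	return _mm_movemask_ps(_mm_cmpeq_ps(a, b)) == 0xf;
}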
defined(BT_USE_NEON) mVec128 = vminq_f32(mVec128, other.mVec128); @@ -649,156 +637,155 @@ public: #endif } - SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z) + SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z) { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; m_floats[3] = btScalar(0.f); } - void getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const + void getSkewSymmetricMatrix(btVector3 * v0, btVector3 * v1, btVector3 * v2) const { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - - __m128 V = _mm_and_ps(mVec128, btvFFF0fMask); +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + + __m128 V = _mm_and_ps(mVec128, btvFFF0fMask); __m128 V0 = _mm_xor_ps(btvMzeroMask, V); __m128 V2 = _mm_movelh_ps(V0, V); - + __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE); - - V0 = _mm_shuffle_ps(V0, V, 0xDB); + + V0 = _mm_shuffle_ps(V0, V, 0xDB); V2 = _mm_shuffle_ps(V2, V, 0xF9); - + v0->mVec128 = V0; v1->mVec128 = V1; v2->mVec128 = V2; #else - v0->setValue(0. ,-z() ,y()); - v1->setValue(z() ,0. ,-x()); - v2->setValue(-y() ,x() ,0.); + v0->setValue(0., -z(), y()); + v1->setValue(z(), 0., -x()); + v2->setValue(-y(), x(), 0.); #endif } void setZero() { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128); #elif defined(BT_USE_NEON) - int32x4_t vi = vdupq_n_s32(0); + int32x4_t vi = vdupq_n_s32(0); mVec128 = vreinterpretq_f32_s32(vi); -#else - setValue(btScalar(0.),btScalar(0.),btScalar(0.)); +#else + setValue(btScalar(0.), btScalar(0.), btScalar(0.)); #endif } - SIMD_FORCE_INLINE bool isZero() const + SIMD_FORCE_INLINE bool isZero() const { return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0); } - - SIMD_FORCE_INLINE bool fuzzyZero() const + SIMD_FORCE_INLINE bool fuzzyZero() const { - return length2() < SIMD_EPSILON*SIMD_EPSILON; + return length2() < SIMD_EPSILON * SIMD_EPSILON; } - SIMD_FORCE_INLINE void serialize(struct btVector3Data& dataOut) const; + SIMD_FORCE_INLINE void serialize(struct btVector3Data & dataOut) const; + + SIMD_FORCE_INLINE void deSerialize(const struct btVector3DoubleData& dataIn); - SIMD_FORCE_INLINE void deSerialize(const struct btVector3DoubleData& dataIn); + SIMD_FORCE_INLINE void deSerialize(const struct btVector3FloatData& dataIn); - SIMD_FORCE_INLINE void deSerialize(const struct btVector3FloatData& dataIn); + SIMD_FORCE_INLINE void serializeFloat(struct btVector3FloatData & dataOut) const; - SIMD_FORCE_INLINE void serializeFloat(struct btVector3FloatData& dataOut) const; + SIMD_FORCE_INLINE void deSerializeFloat(const struct btVector3FloatData& dataIn); - SIMD_FORCE_INLINE void deSerializeFloat(const struct btVector3FloatData& dataIn); + SIMD_FORCE_INLINE void serializeDouble(struct btVector3DoubleData & dataOut) const; - SIMD_FORCE_INLINE void serializeDouble(struct btVector3DoubleData& dataOut) const; + SIMD_FORCE_INLINE void deSerializeDouble(const struct btVector3DoubleData& dataIn); - SIMD_FORCE_INLINE void deSerializeDouble(const struct btVector3DoubleData& dataIn); - - /**@brief returns index of maximum dot product between this and vectors in array[] + /**@brief returns index of maximum dot product between this and vectors in array[] * @param array The other vectors * @param array_count The 
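getSkewSymmetricMatrix() above emits the rows of the cross-product matrix [v]x, the matrix satisfying [v]x * u == v x u for every u, which is what the scalar fallback spells out row by row. A sketch of that property, with illustrative values:

// Rows of [v]x as produced by getSkewSymmetricMatrix():
//   ( 0, -z,  y)
//   ( z,  0, -x)
//   (-y,  x,  0)
btVector3 v(1, 2, 3), u(4, 5, 6);
btVector3 r0, r1, r2;
v.getSkewSymmetricMatrix(&r0, &r1, &r2);
btVector3 check(r0.dot(u), r1.dot(u), r2.dot(u));  // == v.cross(u) == (-3, 6, -3)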
number of other vectors * @param dotOut The maximum dot product */ - SIMD_FORCE_INLINE long maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; + SIMD_FORCE_INLINE long maxDot(const btVector3* array, long array_count, btScalar& dotOut) const; - /**@brief returns index of minimum dot product between this and vectors in array[] + /**@brief returns index of minimum dot product between this and vectors in array[] * @param array The other vectors * @param array_count The number of other vectors - * @param dotOut The minimum dot product */ - SIMD_FORCE_INLINE long minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; - - /* create a vector as btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 )) */ - SIMD_FORCE_INLINE btVector3 dot3( const btVector3 &v0, const btVector3 &v1, const btVector3 &v2 ) const - { -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - - __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 ); - __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 ); - __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 ); - __m128 b0 = _mm_unpacklo_ps( a0, a1 ); - __m128 b1 = _mm_unpackhi_ps( a0, a1 ); - __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() ); - __m128 r = _mm_movelh_ps( b0, b2 ); - r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 )); - a2 = _mm_and_ps( a2, btvxyzMaskf); - r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) ))); - return btVector3(r); - + * @param dotOut The minimum dot product */ + SIMD_FORCE_INLINE long minDot(const btVector3* array, long array_count, btScalar& dotOut) const; + + /* create a vector as btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 )) */ + SIMD_FORCE_INLINE btVector3 dot3(const btVector3& v0, const btVector3& v1, const btVector3& v2) const + { +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + + __m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128); + __m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128); + __m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128); + __m128 b0 = _mm_unpacklo_ps(a0, a1); + __m128 b1 = _mm_unpackhi_ps(a0, a1); + __m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps()); + __m128 r = _mm_movelh_ps(b0, b2); + r = _mm_add_ps(r, _mm_movehl_ps(b2, b0)); + a2 = _mm_and_ps(a2, btvxyzMaskf); + r = _mm_add_ps(r, btCastdTo128f(_mm_move_sd(btCastfTo128d(a2), btCastfTo128d(b1)))); + return btVector3(r); + #elif defined(BT_USE_NEON) - static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast(-1), static_cast(-1), static_cast(-1), 0 }; - float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128); - float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128); - float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128); - float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1)); - a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask ); - float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] ); - float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f)); - return btVector3( vcombine_f32(b0, b1) ); -#else - return btVector3( dot(v0), dot(v1), dot(v2)); + static const uint32x4_t xyzMask = (const uint32x4_t){static_cast(-1), static_cast(-1), static_cast(-1), 0}; + float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128); + float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128); + float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128); + float32x2x2_t zLo = 
vtrn_f32(vget_high_f32(a0), vget_high_f32(a1)); + a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask); + float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]); + float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f)); + return btVector3(vcombine_f32(b0, b1)); +#else + return btVector3(dot(v0), dot(v1), dot(v2)); #endif - } + } }; /**@brief Return the sum of two vectors (Point symantics)*/ -SIMD_FORCE_INLINE btVector3 -operator+(const btVector3& v1, const btVector3& v2) +SIMD_FORCE_INLINE btVector3 +operator+(const btVector3& v1, const btVector3& v2) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128)); #elif defined(BT_USE_NEON) return btVector3(vaddq_f32(v1.mVec128, v2.mVec128)); #else return btVector3( - v1.m_floats[0] + v2.m_floats[0], - v1.m_floats[1] + v2.m_floats[1], - v1.m_floats[2] + v2.m_floats[2]); + v1.m_floats[0] + v2.m_floats[0], + v1.m_floats[1] + v2.m_floats[1], + v1.m_floats[2] + v2.m_floats[2]); #endif } /**@brief Return the elementwise product of two vectors */ -SIMD_FORCE_INLINE btVector3 -operator*(const btVector3& v1, const btVector3& v2) +SIMD_FORCE_INLINE btVector3 +operator*(const btVector3& v1, const btVector3& v2) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128)); #elif defined(BT_USE_NEON) return btVector3(vmulq_f32(v1.mVec128, v2.mVec128)); #else return btVector3( - v1.m_floats[0] * v2.m_floats[0], - v1.m_floats[1] * v2.m_floats[1], - v1.m_floats[2] * v2.m_floats[2]); + v1.m_floats[0] * v2.m_floats[0], + v1.m_floats[1] * v2.m_floats[1], + v1.m_floats[2] * v2.m_floats[2]); #endif } /**@brief Return the difference between two vectors */ -SIMD_FORCE_INLINE btVector3 +SIMD_FORCE_INLINE btVector3 operator-(const btVector3& v1, const btVector3& v2) { -#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) // without _mm_and_ps this code causes slowdown in Concave moving __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128); @@ -808,33 +795,33 @@ operator-(const btVector3& v1, const btVector3& v2) return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask)); #else return btVector3( - v1.m_floats[0] - v2.m_floats[0], - v1.m_floats[1] - v2.m_floats[1], - v1.m_floats[2] - v2.m_floats[2]); + v1.m_floats[0] - v2.m_floats[0], + v1.m_floats[1] - v2.m_floats[1], + v1.m_floats[2] - v2.m_floats[2]); #endif } /**@brief Return the negative of the vector */ -SIMD_FORCE_INLINE btVector3 +SIMD_FORCE_INLINE btVector3 operator-(const btVector3& v) { -#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)) +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask); - return btVector3(_mm_and_ps(r, btvFFF0fMask)); + return btVector3(_mm_and_ps(r, btvFFF0fMask)); #elif defined(BT_USE_NEON) return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask)); -#else +#else return btVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]); #endif } /**@brief Return the vector scaled by s */ -SIMD_FORCE_INLINE btVector3 +SIMD_FORCE_INLINE btVector3 operator*(const btVector3& v, const btScalar& s) { -#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) - 
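dot3() above evaluates three dot products against *this in one pass; the SSE path multiplies lane-wise three times, then folds a partial 3x4 transpose (unpacklo/unpackhi/movelh/movehl) into the reduction so each output lane receives its own column sum. Usage sketch, with illustrative values:

btVector3 ex(1, 0, 0), ey(0, 1, 0), ez(0, 0, 1);
btVector3 n(0.0f, 1.0f, 0.0f);
btVector3 r = n.dot3(ex, ey, ez);  // (n.dot(ex), n.dot(ey), n.dot(ez)) == (0, 1, 0)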
__m128 vs = _mm_load_ss(&s); // (S 0 0 0) - vs = bt_pshufd_ps(vs, 0x80); // (S S S 0.0) +#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) + __m128 vs = _mm_load_ss(&s); // (S 0 0 0) + vs = bt_pshufd_ps(vs, 0x80); // (S S S 0.0) return btVector3(_mm_mul_ps(v.mVec128, vs)); #elif defined(BT_USE_NEON) float32x4_t r = vmulq_n_f32(v.mVec128, s); @@ -845,10 +832,10 @@ operator*(const btVector3& v, const btScalar& s) } /**@brief Return the vector scaled by s */ -SIMD_FORCE_INLINE btVector3 +SIMD_FORCE_INLINE btVector3 operator*(const btScalar& s, const btVector3& v) -{ - return v * s; +{ + return v * s; } /**@brief Return the vector inversely scaled by s */ @@ -856,7 +843,7 @@ SIMD_FORCE_INLINE btVector3 operator/(const btVector3& v, const btScalar& s) { btFullAssert(s != btScalar(0.0)); -#if 0 //defined(BT_USE_SSE_IN_API) +#if 0 //defined(BT_USE_SSE_IN_API) // this code is not faster ! __m128 vs = _mm_load_ss(&s); vs = _mm_div_ss(v1110, vs); @@ -872,67 +859,65 @@ operator/(const btVector3& v, const btScalar& s) SIMD_FORCE_INLINE btVector3 operator/(const btVector3& v1, const btVector3& v2) { -#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) +#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128); vec = _mm_and_ps(vec, btvFFF0fMask); - return btVector3(vec); + return btVector3(vec); #elif defined(BT_USE_NEON) float32x4_t x, y, v, m; x = v1.mVec128; y = v2.mVec128; - - v = vrecpeq_f32(y); // v ~ 1/y - m = vrecpsq_f32(y, v); // m = (2-v*y) - v = vmulq_f32(v, m); // vv = v*m ~~ 1/y - m = vrecpsq_f32(y, v); // mm = (2-vv*y) - v = vmulq_f32(v, x); // x*vv - v = vmulq_f32(v, m); // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y + + v = vrecpeq_f32(y); // v ~ 1/y + m = vrecpsq_f32(y, v); // m = (2-v*y) + v = vmulq_f32(v, m); // vv = v*m ~~ 1/y + m = vrecpsq_f32(y, v); // mm = (2-vv*y) + v = vmulq_f32(v, x); // x*vv + v = vmulq_f32(v, m); // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y return btVector3(v); #else return btVector3( - v1.m_floats[0] / v2.m_floats[0], - v1.m_floats[1] / v2.m_floats[1], - v1.m_floats[2] / v2.m_floats[2]); + v1.m_floats[0] / v2.m_floats[0], + v1.m_floats[1] / v2.m_floats[1], + v1.m_floats[2] / v2.m_floats[2]); #endif } /**@brief Return the dot product between two vectors */ -SIMD_FORCE_INLINE btScalar -btDot(const btVector3& v1, const btVector3& v2) -{ - return v1.dot(v2); +SIMD_FORCE_INLINE btScalar +btDot(const btVector3& v1, const btVector3& v2) +{ + return v1.dot(v2); } - /**@brief Return the distance squared between two vectors */ SIMD_FORCE_INLINE btScalar -btDistance2(const btVector3& v1, const btVector3& v2) -{ - return v1.distance2(v2); +btDistance2(const btVector3& v1, const btVector3& v2) +{ + return v1.distance2(v2); } - /**@brief Return the distance between two vectors */ SIMD_FORCE_INLINE btScalar -btDistance(const btVector3& v1, const btVector3& v2) -{ - return v1.distance(v2); +btDistance(const btVector3& v1, const btVector3& v2) +{ + return v1.distance(v2); } /**@brief Return the angle between two vectors */ SIMD_FORCE_INLINE btScalar -btAngle(const btVector3& v1, const btVector3& v2) -{ - return v1.angle(v2); +btAngle(const btVector3& v1, const btVector3& v2) +{ + return v1.angle(v2); } /**@brief Return the cross product of two vectors */ -SIMD_FORCE_INLINE btVector3 -btCross(const btVector3& v1, const btVector3& v2) -{ - return v1.cross(v2); +SIMD_FORCE_INLINE btVector3 +btCross(const btVector3& v1, const btVector3& v2) +{ + return v1.cross(v2); } 
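NEON has no packed float divide, so the elementwise operator/ above starts from vrecpeq_f32's rough reciprocal estimate and applies two Newton-Raphson refinements; vrecpsq_f32(y, v) returns exactly the (2 - y*v) factor of the recurrence, as the inline comments note. One step in scalar form (function name is mine):

// Newton-Raphson for f(r) = 1/r - y, whose root is r = 1/y:
static float recip_nr_step(float y, float r0)
{
	return r0 * (2.0f - y * r0);  // mirrors vmulq_f32(v, vrecpsq_f32(y, v))
}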
SIMD_FORCE_INLINE btScalar @@ -945,14 +930,12 @@ btTriple(const btVector3& v1, const btVector3& v2, const btVector3& v3) * @param v1 One vector * @param v2 The other vector * @param t The ration of this to v (t = 0 => return v1, t=1 => return v2) */ -SIMD_FORCE_INLINE btVector3 +SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v1, const btVector3& v2, const btScalar& t) { return v1.lerp(v2, t); } - - SIMD_FORCE_INLINE btScalar btVector3::distance2(const btVector3& v) const { return (v - *this).length2(); @@ -968,140 +951,137 @@ SIMD_FORCE_INLINE btVector3 btVector3::normalized() const btVector3 nrm = *this; return nrm.normalize(); -} +} -SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar _angle ) const +SIMD_FORCE_INLINE btVector3 btVector3::rotate(const btVector3& wAxis, const btScalar _angle) const { // wAxis must be a unit lenght vector -#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) - __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128); - btScalar ssin = btSin( _angle ); - __m128 C = wAxis.cross( mVec128 ).mVec128; + __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128); + btScalar ssin = btSin(_angle); + __m128 C = wAxis.cross(mVec128).mVec128; O = _mm_and_ps(O, btvFFF0fMask); - btScalar scos = btCos( _angle ); - - __m128 vsin = _mm_load_ss(&ssin); // (S 0 0 0) - __m128 vcos = _mm_load_ss(&scos); // (S 0 0 0) - - __m128 Y = bt_pshufd_ps(O, 0xC9); // (Y Z X 0) - __m128 Z = bt_pshufd_ps(O, 0xD2); // (Z X Y 0) + btScalar scos = btCos(_angle); + + __m128 vsin = _mm_load_ss(&ssin); // (S 0 0 0) + __m128 vcos = _mm_load_ss(&scos); // (S 0 0 0) + + __m128 Y = bt_pshufd_ps(O, 0xC9); // (Y Z X 0) + __m128 Z = bt_pshufd_ps(O, 0xD2); // (Z X Y 0) O = _mm_add_ps(O, Y); - vsin = bt_pshufd_ps(vsin, 0x80); // (S S S 0) + vsin = bt_pshufd_ps(vsin, 0x80); // (S S S 0) O = _mm_add_ps(O, Z); - vcos = bt_pshufd_ps(vcos, 0x80); // (S S S 0) - - vsin = vsin * C; - O = O * wAxis.mVec128; - __m128 X = mVec128 - O; - - O = O + vsin; + vcos = bt_pshufd_ps(vcos, 0x80); // (S S S 0) + + vsin = vsin * C; + O = O * wAxis.mVec128; + __m128 X = mVec128 - O; + + O = O + vsin; vcos = vcos * X; - O = O + vcos; - + O = O + vcos; + return btVector3(O); #else - btVector3 o = wAxis * wAxis.dot( *this ); + btVector3 o = wAxis * wAxis.dot(*this); btVector3 _x = *this - o; btVector3 _y; - _y = wAxis.cross( *this ); + _y = wAxis.cross(*this); - return ( o + _x * btCos( _angle ) + _y * btSin( _angle ) ); + return (o + _x * btCos(_angle) + _y * btSin(_angle)); #endif } -SIMD_FORCE_INLINE long btVector3::maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const +SIMD_FORCE_INLINE long btVector3::maxDot(const btVector3* array, long array_count, btScalar& dotOut) const { -#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - #if defined _WIN32 || defined (BT_USE_SSE) - const long scalar_cutoff = 10; - long _maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #elif defined BT_USE_NEON - const long scalar_cutoff = 4; - extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #endif - if( array_count < scalar_cutoff ) +#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON) +#if defined _WIN32 || defined(BT_USE_SSE) + const long scalar_cutoff = 10; + long 
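rotate() above is Rodrigues' rotation specialized to a unit axis w: o = w * (w . v) is the component along the axis and stays fixed, x = v - o is the radial part, y = w x v is its in-plane perpendicular, and the result is o + x*cos(angle) + y*sin(angle). A quick check, using btScalar.h's SIMD_HALF_PI:

btVector3 v(1, 0, 0);
btVector3 axis(0, 0, 1);                     // must be unit length, as the comment warns
btVector3 r = v.rotate(axis, SIMD_HALF_PI);  // ~(0, 1, 0)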
_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#elif defined BT_USE_NEON + const long scalar_cutoff = 4; + extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut); #endif - { - btScalar maxDot1 = -SIMD_INFINITY; - int i = 0; - int ptIndex = -1; - for( i = 0; i < array_count; i++ ) - { - btScalar dot = array[i].dot(*this); - - if( dot > maxDot1 ) - { - maxDot1 = dot; - ptIndex = i; - } - } - - dotOut = maxDot1; - return ptIndex; - } -#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - return _maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut ); + if (array_count < scalar_cutoff) +#endif + { + btScalar maxDot1 = -SIMD_INFINITY; + int i = 0; + int ptIndex = -1; + for (i = 0; i < array_count; i++) + { + btScalar dot = array[i].dot(*this); + + if (dot > maxDot1) + { + maxDot1 = dot; + ptIndex = i; + } + } + + dotOut = maxDot1; + return ptIndex; + } +#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON) + return _maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut); #endif } -SIMD_FORCE_INLINE long btVector3::minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const +SIMD_FORCE_INLINE long btVector3::minDot(const btVector3* array, long array_count, btScalar& dotOut) const { -#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - #if defined BT_USE_SSE - const long scalar_cutoff = 10; - long _mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #elif defined BT_USE_NEON - const long scalar_cutoff = 4; - extern long (*_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut ); - #else - #error unhandled arch! - #endif - - if( array_count < scalar_cutoff ) +#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON) +#if defined BT_USE_SSE + const long scalar_cutoff = 10; + long _mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#elif defined BT_USE_NEON + const long scalar_cutoff = 4; + extern long (*_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut); +#else +#error unhandled arch! 
#endif - { - btScalar minDot = SIMD_INFINITY; - int i = 0; - int ptIndex = -1; - - for( i = 0; i < array_count; i++ ) - { - btScalar dot = array[i].dot(*this); - - if( dot < minDot ) - { - minDot = dot; - ptIndex = i; - } - } - - dotOut = minDot; - - return ptIndex; - } -#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - return _mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut ); -#endif//BT_USE_SIMD_VECTOR3 -} + if (array_count < scalar_cutoff) +#endif + { + btScalar minDot = SIMD_INFINITY; + int i = 0; + int ptIndex = -1; + + for (i = 0; i < array_count; i++) + { + btScalar dot = array[i].dot(*this); + + if (dot < minDot) + { + minDot = dot; + ptIndex = i; + } + } + + dotOut = minDot; + + return ptIndex; + } +#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON) + return _mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut); +#endif //BT_USE_SIMD_VECTOR3 +} class btVector4 : public btVector3 { public: - SIMD_FORCE_INLINE btVector4() {} - - SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) - : btVector3(_x,_y,_z) + SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) + : btVector3(_x, _y, _z) { m_floats[3] = _w; } -#if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON) +#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON) SIMD_FORCE_INLINE btVector4(const btSimdFloat4 vec) { mVec128 = vec; @@ -1112,34 +1092,32 @@ public: mVec128 = rhs.mVec128; } - SIMD_FORCE_INLINE btVector4& - operator=(const btVector4& v) + SIMD_FORCE_INLINE btVector4& + operator=(const btVector4& v) { mVec128 = v.mVec128; return *this; } -#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) +#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) - SIMD_FORCE_INLINE btVector4 absolute4() const + SIMD_FORCE_INLINE btVector4 absolute4() const { -#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) +#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE) return btVector4(_mm_and_ps(mVec128, btvAbsfMask)); #elif defined(BT_USE_NEON) return btVector4(vabsq_f32(mVec128)); -#else +#else return btVector4( - btFabs(m_floats[0]), - btFabs(m_floats[1]), + btFabs(m_floats[0]), + btFabs(m_floats[1]), btFabs(m_floats[2]), btFabs(m_floats[3])); #endif } + btScalar getW() const { return m_floats[3]; } - btScalar getW() const { return m_floats[3];} - - - SIMD_FORCE_INLINE int maxAxis4() const + SIMD_FORCE_INLINE int maxAxis4() const { int maxIndex = -1; btScalar maxVal = btScalar(-BT_LARGE_FLOAT); @@ -1156,7 +1134,7 @@ public: if (m_floats[2] > maxVal) { maxIndex = 2; - maxVal =m_floats[2]; + maxVal = m_floats[2]; } if (m_floats[3] > maxVal) { @@ -1166,7 +1144,6 @@ public: return maxIndex; } - SIMD_FORCE_INLINE int minAxis4() const { int minIndex = -1; @@ -1184,190 +1161,176 @@ public: if (m_floats[2] < minVal) { minIndex = 2; - minVal =m_floats[2]; + minVal = m_floats[2]; } if (m_floats[3] < minVal) { minIndex = 3; } - + return minIndex; } - - SIMD_FORCE_INLINE int closestAxis4() const + SIMD_FORCE_INLINE int closestAxis4() const { return absolute4().maxAxis4(); } - - - - /**@brief Set x,y,z and zero w + /**@brief Set x,y,z and zero w * @param x Value of x * @param y Value of y * @param z Value of z */ - -/* void 
getValue(btScalar *m) const + /* void getValue(btScalar *m) const { m[0] = m_floats[0]; m[1] = m_floats[1]; m[2] =m_floats[2]; } */ -/**@brief Set the values + /**@brief Set the values * @param x Value of x * @param y Value of y * @param z Value of z * @param w Value of w */ - SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) - { - m_floats[0]=_x; - m_floats[1]=_y; - m_floats[2]=_z; - m_floats[3]=_w; - } - - + SIMD_FORCE_INLINE void setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) + { + m_floats[0] = _x; + m_floats[1] = _y; + m_floats[2] = _z; + m_floats[3] = _w; + } }; - ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -SIMD_FORCE_INLINE void btSwapScalarEndian(const btScalar& sourceVal, btScalar& destVal) +SIMD_FORCE_INLINE void btSwapScalarEndian(const btScalar& sourceVal, btScalar& destVal) { #ifdef BT_USE_DOUBLE_PRECISION - unsigned char* dest = (unsigned char*) &destVal; - const unsigned char* src = (const unsigned char*) &sourceVal; + unsigned char* dest = (unsigned char*)&destVal; + const unsigned char* src = (const unsigned char*)&sourceVal; dest[0] = src[7]; - dest[1] = src[6]; - dest[2] = src[5]; - dest[3] = src[4]; - dest[4] = src[3]; - dest[5] = src[2]; - dest[6] = src[1]; - dest[7] = src[0]; + dest[1] = src[6]; + dest[2] = src[5]; + dest[3] = src[4]; + dest[4] = src[3]; + dest[5] = src[2]; + dest[6] = src[1]; + dest[7] = src[0]; #else - unsigned char* dest = (unsigned char*) &destVal; - const unsigned char* src = (const unsigned char*) &sourceVal; + unsigned char* dest = (unsigned char*)&destVal; + const unsigned char* src = (const unsigned char*)&sourceVal; dest[0] = src[3]; - dest[1] = src[2]; - dest[2] = src[1]; - dest[3] = src[0]; -#endif //BT_USE_DOUBLE_PRECISION + dest[1] = src[2]; + dest[2] = src[1]; + dest[3] = src[0]; +#endif //BT_USE_DOUBLE_PRECISION } ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -SIMD_FORCE_INLINE void btSwapVector3Endian(const btVector3& sourceVec, btVector3& destVec) +SIMD_FORCE_INLINE void btSwapVector3Endian(const btVector3& sourceVec, btVector3& destVec) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { - btSwapScalarEndian(sourceVec[i],destVec[i]); + btSwapScalarEndian(sourceVec[i], destVec[i]); } - } ///btUnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization -SIMD_FORCE_INLINE void btUnSwapVector3Endian(btVector3& vector) +SIMD_FORCE_INLINE void btUnSwapVector3Endian(btVector3& vector) { - - btVector3 swappedVec; - for (int i=0;i<4;i++) + btVector3 swappedVec; + for (int i = 0; i < 4; i++) { - btSwapScalarEndian(vector[i],swappedVec[i]); + btSwapScalarEndian(vector[i], swappedVec[i]); } vector = swappedVec; } template -SIMD_FORCE_INLINE void btPlaneSpace1 (const T& n, T& p, T& q) +SIMD_FORCE_INLINE void btPlaneSpace1(const T& n, T& p, T& q) { - if (btFabs(n[2]) > SIMDSQRT12) { - // choose p in y-z plane - btScalar a = n[1]*n[1] + n[2]*n[2]; - btScalar k = btRecipSqrt (a); - p[0] = 0; - p[1] = -n[2]*k; - p[2] = n[1]*k; - // set q = n x p - q[0] = a*k; - q[1] = -n[0]*p[2]; - q[2] = n[0]*p[1]; - } - else { - // choose p in x-y plane - btScalar a = n[0]*n[0] + n[1]*n[1]; - btScalar k = btRecipSqrt (a); - p[0] = -n[1]*k; - p[1] = n[0]*k; - p[2] = 0; - // set q = n x p - q[0] = -n[2]*p[1]; - q[1] = n[2]*p[0]; - q[2] = a*k; - } + if (btFabs(n[2]) > SIMDSQRT12) + { + // choose p in y-z plane + 
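btSwapScalarEndian above reverses bytes through two raw char pointers, eight bytes per value in double-precision builds and four otherwise; the vector variants swap all four components, w included, and btUnSwapVector3Endian needs its temporary because source and destination alias. Round-trip sketch:

btScalar a = btScalar(1.0), swapped, restored;
btSwapScalarEndian(a, swapped);         // byte order reversed
btSwapScalarEndian(swapped, restored);  // a second swap restores the value
// restored == a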
btScalar a = n[1] * n[1] + n[2] * n[2]; + btScalar k = btRecipSqrt(a); + p[0] = 0; + p[1] = -n[2] * k; + p[2] = n[1] * k; + // set q = n x p + q[0] = a * k; + q[1] = -n[0] * p[2]; + q[2] = n[0] * p[1]; + } + else + { + // choose p in x-y plane + btScalar a = n[0] * n[0] + n[1] * n[1]; + btScalar k = btRecipSqrt(a); + p[0] = -n[1] * k; + p[1] = n[0] * k; + p[2] = 0; + // set q = n x p + q[0] = -n[2] * p[1]; + q[1] = n[2] * p[0]; + q[2] = a * k; + } } - -struct btVector3FloatData +struct btVector3FloatData { - float m_floats[4]; + float m_floats[4]; }; -struct btVector3DoubleData +struct btVector3DoubleData { - double m_floats[4]; - + double m_floats[4]; }; -SIMD_FORCE_INLINE void btVector3::serializeFloat(struct btVector3FloatData& dataOut) const +SIMD_FORCE_INLINE void btVector3::serializeFloat(struct btVector3FloatData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = float(m_floats[i]); } -SIMD_FORCE_INLINE void btVector3::deSerializeFloat(const struct btVector3FloatData& dataIn) +SIMD_FORCE_INLINE void btVector3::deSerializeFloat(const struct btVector3FloatData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = btScalar(dataIn.m_floats[i]); } - -SIMD_FORCE_INLINE void btVector3::serializeDouble(struct btVector3DoubleData& dataOut) const +SIMD_FORCE_INLINE void btVector3::serializeDouble(struct btVector3DoubleData& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = double(m_floats[i]); } -SIMD_FORCE_INLINE void btVector3::deSerializeDouble(const struct btVector3DoubleData& dataIn) +SIMD_FORCE_INLINE void btVector3::deSerializeDouble(const struct btVector3DoubleData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = btScalar(dataIn.m_floats[i]); } - -SIMD_FORCE_INLINE void btVector3::serialize(struct btVector3Data& dataOut) const +SIMD_FORCE_INLINE void btVector3::serialize(struct btVector3Data& dataOut) const { ///could also do a memcpy, check if it is worth it - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) dataOut.m_floats[i] = m_floats[i]; } - -SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3FloatData& dataIn) +SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3FloatData& dataIn) { - for (int i = 0; i<4; i++) + for (int i = 0; i < 4; i++) m_floats[i] = (btScalar)dataIn.m_floats[i]; } - -SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3DoubleData& dataIn) +SIMD_FORCE_INLINE void btVector3::deSerialize(const struct btVector3DoubleData& dataIn) { - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) m_floats[i] = (btScalar)dataIn.m_floats[i]; } -#endif //BT_VECTOR3_H +#endif //BT_VECTOR3_H -- cgit v1.2.3
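A closing note on btPlaneSpace1 above: it builds an orthonormal basis {p, q} for the plane with normal n, branching on |n.z| > 1/sqrt(2) (SIMDSQRT12) so the projection it normalizes is never near-degenerate; in both branches q comes out as exactly n x p. A sketch of the invariants, assuming n is unit length:

btVector3 n(0, 0, 1), p, q;
btPlaneSpace1(n, p, q);  // here: p == (0, -1, 0), q == (1, 0, 0)
// Invariants (up to rounding): p.dot(n) == 0, q.dot(n) == 0, p.dot(q) == 0,
// p.length() == 1, and q == n.cross(p).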