author | Andrea Catania <info@andreacatania.com> | 2018-09-07 16:11:04 +0200
committer | Andrea Catania <info@andreacatania.com> | 2018-09-07 16:11:04 +0200
commit | 6142448417f4e15bf0bc0c94df7d1862a790e3c7 (patch)
tree | b6b0a44df905e0ad2e6f82eacd5ef4acdf9a0df1 /thirdparty/bullet/LinearMath/TaskScheduler
parent | 53070437514e448c87f6cb31cf5b27a3839dbfa1 (diff)
Update bullet to Master 12409f1118a7c7a266f9071350c70789dfe73bb9
Diffstat (limited to 'thirdparty/bullet/LinearMath/TaskScheduler')
4 files changed, 1709 insertions, 0 deletions
diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp
new file mode 100644
index 0000000000..49510d1660
--- /dev/null
+++ b/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -0,0 +1,802 @@
+
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btQuickprof.h"
+#include <stdio.h>
+#include <algorithm>
+
+
+
+#if BT_THREADSAFE
+
+#include "btThreadSupportInterface.h"
+
+#if defined( _WIN32 )
+
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>
+
+#endif
+
+
+typedef unsigned long long btU64;
+static const int kCacheLineSize = 64;
+
+void btSpinPause()
+{
+#if defined( _WIN32 )
+    YieldProcessor();
+#endif
+}
+
+
+struct WorkerThreadStatus
+{
+    enum Type
+    {
+        kInvalid,
+        kWaitingForWork,
+        kWorking,
+        kSleeping,
+    };
+};
+
+
+ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives
+{
+    static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
+    // directives for all worker threads packed into a single cacheline
+    char m_threadDirs[kMaxThreadCount];
+
+public:
+    enum Type
+    {
+        kInvalid,
+        kGoToSleep,        // go to sleep
+        kStayAwakeButIdle, // wait for not checking job queue
+        kScanForJobs,      // actively scan job queue for jobs
+    };
+    WorkerThreadDirectives()
+    {
+        for ( int i = 0; i < kMaxThreadCount; ++i )
+        {
+            m_threadDirs[ i ] = 0;
+        }
+    }
+
+    Type getDirective(int threadId)
+    {
+        btAssert(threadId < kMaxThreadCount);
+        return static_cast<Type>(m_threadDirs[threadId]);
+    }
+
+    void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
+    {
+        btAssert( threadBegin < threadEnd );
+        btAssert( threadEnd <= kMaxThreadCount );
+        char dirChar = static_cast<char>(dir);
+        for ( int i = threadBegin; i < threadEnd; ++i )
+        {
+            m_threadDirs[ i ] = dirChar;
+        }
+    }
+};
+
+class JobQueue;
+
+ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage
+{
+    int m_threadId;
+    WorkerThreadStatus::Type m_status;
+    int m_numJobsFinished;
+    btSpinMutex m_mutex;
+    btScalar m_sumResult;
+    WorkerThreadDirectives * m_directive;
+    JobQueue* m_queue;
+    btClock* m_clock;
+    unsigned int m_cooldownTime;
+};
+
+
+struct IJob
+{
+    virtual void executeJob(int threadId) = 0;
+};
+
+class ParallelForJob : public IJob
+{
+    const btIParallelForBody* m_body;
+    int m_begin;
+    int m_end;
+
+public:
+    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
+    {
+        m_body = &body;
+        m_begin = iBegin;
+        m_end = iEnd;
+    }
+    virtual void executeJob(int threadId) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        m_body->forLoop( m_begin, m_end );
+    }
+};
+
+
+class ParallelSumJob : public IJob
+{
+    const btIParallelSumBody* m_body;
+    ThreadLocalStorage* m_threadLocalStoreArray;
+    int m_begin;
+    int m_end;
+
+public:
+    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls )
+    {
+        m_body = &body;
+        m_threadLocalStoreArray = tls;
+        m_begin = iBegin;
+        m_end = iEnd;
+    }
+    virtual void executeJob( int threadId ) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        btScalar val = m_body->sumLoop( m_begin, m_end );
+#if BT_PARALLEL_SUM_DETERMINISTISM
+        // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
+        const float TRUNC_SCALE = float(1<<19);
+        val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE; // truncate some bits
+#endif
+        m_threadLocalStoreArray[threadId].m_sumResult += val;
+    }
+};
+
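The BT_PARALLEL_SUM_DETERMINISTISM path above trades precision for reproducibility: floating-point addition is not associative, so the order in which partial sums land in m_sumResult would otherwise change the result from run to run. A standalone sketch of the same rounding trick (hypothetical demo code, not part of this commit):

```cpp
// Round a partial sum to a fixed 2^-19 grid, as ParallelSumJob does above.
// Partial results that differ only in low-order bits (because jobs were
// accumulated in different orders) then collapse to the same value.
#include <cmath>
#include <cstdio>

static float truncateForDeterminism(float val)
{
    const float TRUNC_SCALE = float(1 << 19);
    return std::floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE;
}

int main()
{
    // Two accumulations of the same sum, differing only in the last few bits:
    float a = 0.333333343f;
    float b = 0.333333313f;
    std::printf("%.9f == %.9f\n", truncateForDeterminism(a), truncateForDeterminism(b));
    return 0;
}
```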
+ATTRIBUTE_ALIGNED64(class) JobQueue
+{
+    btThreadSupportInterface* m_threadSupport;
+    btCriticalSection* m_queueLock;
+    btSpinMutex m_mutex;
+
+    btAlignedObjectArray<IJob*> m_jobQueue;
+    char* m_jobMem;
+    int m_jobMemSize;
+    bool m_queueIsEmpty;
+    int m_tailIndex;
+    int m_headIndex;
+    int m_allocSize;
+    bool m_useSpinMutex;
+    btAlignedObjectArray<JobQueue*> m_neighborContexts;
+    char m_cachePadding[kCacheLineSize]; // prevent false sharing
+
+    void freeJobMem()
+    {
+        if ( m_jobMem )
+        {
+            // free old
+            btAlignedFree(m_jobMem);
+            m_jobMem = NULL;
+        }
+    }
+    void resizeJobMem(int newSize)
+    {
+        if (newSize > m_jobMemSize)
+        {
+            freeJobMem();
+            m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
+            m_jobMemSize = newSize;
+        }
+    }
+
+public:
+
+    JobQueue()
+    {
+        m_jobMem = NULL;
+        m_jobMemSize = 0;
+        m_threadSupport = NULL;
+        m_queueLock = NULL;
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_useSpinMutex = false;
+    }
+    ~JobQueue()
+    {
+        exit();
+    }
+    void exit()
+    {
+        freeJobMem();
+        if (m_queueLock && m_threadSupport)
+        {
+            m_threadSupport->deleteCriticalSection(m_queueLock);
+            m_queueLock = NULL;
+            m_threadSupport = 0;
+        }
+    }
+
+    void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
+    {
+        m_threadSupport = threadSup;
+        if (threadSup)
+        {
+            m_queueLock = m_threadSupport->createCriticalSection();
+        }
+        setupJobStealing(contextArray, contextArray->size());
+    }
+    void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)
+    {
+        btAlignedObjectArray<JobQueue>& contexts = *contextArray;
+        int selfIndex = 0;
+        for (int i = 0; i < contexts.size(); ++i)
+        {
+            if ( this == &contexts[ i ] )
+            {
+                selfIndex = i;
+                break;
+            }
+        }
+        int numNeighbors = btMin(2, contexts.size() - 1);
+        int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3};
+        int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]);
+        m_neighborContexts.reserve( numNeighbors );
+        m_neighborContexts.resizeNoInitialize(0);
+        for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
+        {
+            int neighborIndex = selfIndex + neighborOffsets[i];
+            if ( neighborIndex >= 0 && neighborIndex < numActiveContexts)
+            {
+                m_neighborContexts.push_back( &contexts[ neighborIndex ] );
+            }
+        }
+    }
+
+    bool isQueueEmpty() const {return m_queueIsEmpty;}
+    void lockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.lock();
+        }
+        else
+        {
+            m_queueLock->lock();
+        }
+    }
+    void unlockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.unlock();
+        }
+        else
+        {
+            m_queueLock->unlock();
+        }
+    }
+    void clearQueue(int jobCount, int jobSize)
+    {
+        lockQueue();
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_allocSize = 0;
+        m_queueIsEmpty = true;
+        int jobBufSize = jobSize * jobCount;
+        // make sure we have enough memory allocated to store jobs
+        if ( jobBufSize > m_jobMemSize )
+        {
+            resizeJobMem( jobBufSize );
+        }
+        // make sure job queue is big enough
+        if ( jobCount > m_jobQueue.capacity() )
+        {
+            m_jobQueue.reserve( jobCount );
+        }
+        unlockQueue();
+        m_jobQueue.resizeNoInitialize( 0 );
+    }
+    void* allocJobMem(int jobSize)
+    {
+        btAssert(m_jobMemSize >= (m_allocSize + jobSize));
+        void* jobMem = &m_jobMem[m_allocSize];
+        m_allocSize += jobSize;
+        return jobMem;
+    }
+    void submitJob( IJob* job )
+    {
+        btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
+        m_jobQueue.push_back( job );
+        lockQueue();
+        m_tailIndex++;
+        m_queueIsEmpty = false;
+        unlockQueue();
+    }
+    IJob* consumeJobFromOwnQueue()
+    {
+        if ( m_queueIsEmpty )
+        {
+            // lock free path. even if this is taken erroneously it isn't harmful
+            return NULL;
+        }
+        IJob* job = NULL;
+        lockQueue();
+        if ( !m_queueIsEmpty )
+        {
+            job = m_jobQueue[ m_headIndex++ ];
+            btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
+            if ( m_headIndex == m_tailIndex )
+            {
+                m_queueIsEmpty = true;
+            }
+        }
+        unlockQueue();
+        return job;
+    }
+    IJob* consumeJob()
+    {
+        if (IJob* job = consumeJobFromOwnQueue())
+        {
+            return job;
+        }
+        // own queue is empty, try to steal from neighbor
+        for (int i = 0; i < m_neighborContexts.size(); ++i)
+        {
+            JobQueue* otherContext = m_neighborContexts[ i ];
+            if ( IJob* job = otherContext->consumeJobFromOwnQueue() )
+            {
+                return job;
+            }
+        }
+        return NULL;
+    }
+};
+
+
+static void WorkerThreadFunc( void* userPtr )
+{
+    BT_PROFILE( "WorkerThreadFunc" );
+    ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr;
+    JobQueue* jobQueue = localStorage->m_queue;
+
+    bool shouldSleep = false;
+    int threadId = localStorage->m_threadId;
+    while (! shouldSleep)
+    {
+        // do work
+        localStorage->m_mutex.lock();
+        while ( IJob* job = jobQueue->consumeJob() )
+        {
+            localStorage->m_status = WorkerThreadStatus::kWorking;
+            job->executeJob( threadId );
+            localStorage->m_numJobsFinished++;
+        }
+        localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
+        localStorage->m_mutex.unlock();
+        btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
+        // while queue is empty,
+        while (jobQueue->isQueueEmpty())
+        {
+            // todo: spin wait a bit to avoid hammering the empty queue
+            btSpinPause();
+            if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep )
+            {
+                shouldSleep = true;
+                break;
+            }
+            // if jobs are incoming,
+            if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs )
+            {
+                clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock
+            }
+            else
+            {
+                for ( int i = 0; i < 50; ++i )
+                {
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
+                    {
+                        break;
+                    }
+                }
+                // if no jobs incoming and queue has been empty for the cooldown time, sleep
+                btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
+                if (timeElapsed > localStorage->m_cooldownTime)
+                {
+                    shouldSleep = true;
+                    break;
+                }
+            }
+        }
+    }
+    {
+        BT_PROFILE("sleep");
+        // go sleep
+        localStorage->m_mutex.lock();
+        localStorage->m_status = WorkerThreadStatus::kSleeping;
+        localStorage->m_mutex.unlock();
+    }
+}
+
+
+class btTaskSchedulerDefault : public btITaskScheduler
+{
+    btThreadSupportInterface* m_threadSupport;
+    WorkerThreadDirectives* m_workerDirective;
+    btAlignedObjectArray<JobQueue> m_jobQueues;
+    btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
+    btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
+    btSpinMutex m_antiNestingLock; // prevent nested parallel-for
+    btClock m_clock;
+    int m_numThreads;
+    int m_numWorkerThreads;
+    int m_numActiveJobQueues;
+    int m_maxNumThreads;
+    int m_numJobs;
+    static const int kFirstWorkerThreadId = 1;
+public:
+
+    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
+    {
+        m_threadSupport = NULL;
+        m_workerDirective = NULL;
+    }
+
+    virtual ~btTaskSchedulerDefault()
+    {
+        waitForWorkersToSleep();
+
+        for ( int i = 0; i < m_jobQueues.size(); ++i )
+        {
+            m_jobQueues[i].exit();
+        }
+
+        if (m_threadSupport)
+        {
+            delete m_threadSupport;
+            m_threadSupport = NULL;
+        }
+        if (m_workerDirective)
+        {
+            btAlignedFree(m_workerDirective);
+            m_workerDirective = NULL;
+        }
+    }
+
+    void init()
+    {
+        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc );
+        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
+        m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));
+
+        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
+        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
+        m_numThreads = m_maxNumThreads;
+        // ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
+        int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
+        int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads-1) : (m_maxNumThreads / numThreadsPerQueue);
+        m_jobQueues.resize(numJobQueues);
+        m_numActiveJobQueues = numJobQueues;
+        for ( int i = 0; i < m_jobQueues.size(); ++i )
+        {
+            m_jobQueues[i].init( m_threadSupport, &m_jobQueues );
+        }
+        m_perThreadJobQueues.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            JobQueue* jq = NULL;
+            // only worker threads get a job queue
+            if (i > 0)
+            {
+                if (numThreadsPerQueue == 1)
+                {
+                    // one queue per worker thread
+                    jq = &m_jobQueues[ i - kFirstWorkerThreadId ];
+                }
+                else
+                {
+                    // 2 threads share each queue
+                    jq = &m_jobQueues[ i / numThreadsPerQueue ];
+                }
+            }
+            m_perThreadJobQueues[i] = jq;
+        }
+        m_threadLocalStorage.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_threadId = i;
+            storage.m_directive = m_workerDirective;
+            storage.m_status = WorkerThreadStatus::kSleeping;
+            storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do
+            storage.m_clock = &m_clock;
+            storage.m_queue = m_perThreadJobQueues[i];
+        }
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet
+        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
+    }
+
+    void setWorkerDirectives(WorkerThreadDirectives::Type dir)
+    {
+        m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return m_maxNumThreads;
+    }
+
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );
+        m_numWorkerThreads = m_numThreads - 1;
+        m_numActiveJobQueues = 0;
+        // if there is at least 1 worker,
+        if ( m_numWorkerThreads > 0 )
+        {
+            // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
+            JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ];
+            int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
+            m_numActiveJobQueues = iLastActiveContext + 1;
+            for ( int i = 0; i < m_jobQueues.size(); ++i )
+            {
+                m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues );
+            }
+        }
+        m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
+    }
+
+    void waitJobs()
+    {
+        BT_PROFILE( "waitJobs" );
+        // have the main thread work until the job queues are empty
+        int numMainThreadJobsFinished = 0;
+        for ( int i = 0; i < m_numActiveJobQueues; ++i )
+        {
+            while ( IJob* job = m_jobQueues[i].consumeJob() )
+            {
+                job->executeJob( 0 );
+                numMainThreadJobsFinished++;
+            }
+        }
+
+        // done with jobs for now, tell workers to rest (but not sleep)
+        setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle );
+
+        btU64 clockStart = m_clock.getTimeMicroseconds();
+        // wait for workers to finish any jobs in progress
+        while ( true )
+        {
+            int numWorkerJobsFinished = 0;
+            for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread )
+            {
+                ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
+                storage->m_mutex.lock();
+                numWorkerJobsFinished += storage->m_numJobsFinished;
+                storage->m_mutex.unlock();
+            }
+            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
+            {
+                break;
+            }
+            btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
+            btAssert(timeElapsed < 1000);
+            if (timeElapsed > 100000)
+            {
+                break;
+            }
+            btSpinPause();
+        }
+    }
+
+    void wakeWorkers(int numWorkersToWake)
+    {
+        BT_PROFILE( "wakeWorkers" );
+        btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs );
+        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
+        int numActiveWorkers = 0;
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
+            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status != WorkerThreadStatus::kSleeping)
+            {
+                numActiveWorkers++;
+            }
+        }
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status == WorkerThreadStatus::kSleeping)
+            {
+                m_threadSupport->runTask( iWorker, &storage );
+                numActiveWorkers++;
+            }
+        }
+    }
+
+    void waitForWorkersToSleep()
+    {
+        BT_PROFILE( "waitForWorkersToSleep" );
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
+        m_threadSupport->waitForAllTasks();
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            btAssert( storage.m_status == WorkerThreadStatus::kSleeping );
+        }
+    }
+
+    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
+    {
+        BT_PROFILE( "sleepWorkerThreadsHint" );
+        // hint the task scheduler that we may not be using these threads for a little while
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
+    }
+
+    void prepareWorkerThreads()
+    {
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_mutex.lock();
+            storage.m_numJobsFinished = 0;
+            storage.m_mutex.unlock();
+        }
+        setWorkerDirectives( WorkerThreadDirectives::kScanForJobs );
+    }
+
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelForJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 ); // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
+            {
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            int iThread = kFirstWorkerThreadId; // first worker thread
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelForJob( i, iE, body ); // placement new
+                jq->submitJob( job );
+                iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId; // first worker thread
+                }
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+        }
+        else
+        {
+            BT_PROFILE( "parallelFor_mainThread" );
+            // just run on main thread
+            body.forLoop( iBegin, iEnd );
+        }
+    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelSumJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 ); // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
+            {
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
+
+            // initialize summation
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
+            }
+
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            int iThread = kFirstWorkerThreadId; // first worker thread
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelSumJob( i, iE, body, &m_threadLocalStorage[0] ); // placement new
+                jq->submitJob( job );
+                iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId; // first worker thread
+                }
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+
+            // add up all the thread sums
+            btScalar sum = btScalar(0);
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                sum += m_threadLocalStorage[ iThread ].m_sumResult;
+            }
+            m_antiNestingLock.unlock();
+            return sum;
+        }
+        else
+        {
+            BT_PROFILE( "parallelSum_mainThread" );
+            // just run on main thread
+            return body.sumLoop( iBegin, iEnd );
+        }
+    }
+};
+
+
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
+    ts->init();
+    return ts;
+}
+
+#else // #if BT_THREADSAFE
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    return NULL;
+}
+
+#endif // #else // #if BT_THREADSAFE
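For context, btCreateDefaultTaskScheduler() is consumed through Bullet's btThreads.h API. A minimal usage sketch, assuming the btSetTaskScheduler/btParallelFor entry points declared in LinearMath/btThreads.h (illustration only, not part of this diff):

```cpp
#include "LinearMath/btThreads.h"

// Fill an array in parallel chunks; forLoop() receives [iBegin, iEnd) ranges.
struct FillBody : public btIParallelForBody
{
    float* m_data;
    virtual void forLoop(int iBegin, int iEnd) const BT_OVERRIDE
    {
        for (int i = iBegin; i < iEnd; ++i)
            m_data[i] = i * 0.5f;
    }
};

void fillParallel(float* data, int n)
{
    btITaskScheduler* ts = btCreateDefaultTaskScheduler(); // NULL when !BT_THREADSAFE
    if (ts)
        btSetTaskScheduler(ts);

    FillBody body;
    body.m_data = data;
    // grainSize 64: chunks small enough to load-balance, large enough to
    // amortize the per-job placement-new and queue locking seen above.
    btParallelFor(0, n, 64, body);
}
```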
diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h
new file mode 100644
index 0000000000..a0ad802b1e
--- /dev/null
+++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_THREAD_SUPPORT_INTERFACE_H
+#define BT_THREAD_SUPPORT_INTERFACE_H
+
+
+
+class btCriticalSection
+{
+public:
+    btCriticalSection() {}
+    virtual ~btCriticalSection() {}
+
+    virtual void lock() = 0;
+    virtual void unlock() = 0;
+};
+
+
+class btThreadSupportInterface
+{
+public:
+
+    virtual ~btThreadSupportInterface() {}
+
+    virtual int getNumWorkerThreads() const = 0;            // number of worker threads (total number of logical processors - 1)
+    virtual int getCacheFriendlyNumThreads() const = 0;     // the number of logical processors sharing a single L3 cache
+    virtual int getLogicalToPhysicalCoreRatio() const = 0;  // the number of logical processors per physical processor (usually 1 or 2)
+    virtual void runTask( int threadIndex, void* userData ) = 0;
+    virtual void waitForAllTasks() = 0;
+
+    virtual btCriticalSection* createCriticalSection() = 0;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;
+
+    typedef void( *ThreadFunc )( void* userPtr );
+
+    struct ConstructionInfo
+    {
+        ConstructionInfo( const char* uniqueName,
+                          ThreadFunc userThreadFunc,
+                          int threadStackSize = 65535
+                          )
+            :m_uniqueName( uniqueName ),
+            m_userThreadFunc( userThreadFunc ),
+            m_threadStackSize( threadStackSize )
+        {
+        }
+
+        const char* m_uniqueName;
+        ThreadFunc m_userThreadFunc;
+        int m_threadStackSize;
+    };
+
+    static btThreadSupportInterface* create( const ConstructionInfo& info );
+};
+
+#endif //BT_THREAD_SUPPORT_INTERFACE_H
+
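The interface is small enough that a degenerate single-threaded implementation makes the contract clear: runTask() hands one unit of work to a worker, and waitForAllTasks() is the completion barrier. A hypothetical inline stub (illustration only; the real POSIX and Win32 implementations follow below):

```cpp
#include "btThreadSupportInterface.h"

class btNullCriticalSection : public btCriticalSection
{
public:
    virtual void lock() {}   // nothing to exclude without worker threads
    virtual void unlock() {}
};

// Runs every task inline on the calling thread, so waitForAllTasks()
// has nothing left to wait for.
class btThreadSupportNull : public btThreadSupportInterface
{
    ThreadFunc m_userThreadFunc;
public:
    btThreadSupportNull(const ConstructionInfo& info) : m_userThreadFunc(info.m_userThreadFunc) {}
    virtual int getNumWorkerThreads() const { return 0; }
    virtual int getCacheFriendlyNumThreads() const { return 1; }
    virtual int getLogicalToPhysicalCoreRatio() const { return 1; }
    virtual void runTask(int /*threadIndex*/, void* userData) { m_userThreadFunc(userData); }
    virtual void waitForAllTasks() {}
    virtual btCriticalSection* createCriticalSection() { return new btNullCriticalSection(); }
    virtual void deleteCriticalSection(btCriticalSection* cs) { delete cs; }
};
```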
diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
new file mode 100644
index 0000000000..50ca060dfe
--- /dev/null
+++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -0,0 +1,365 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#if BT_THREADSAFE && !defined( _WIN32 )
+
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btMinMax.h"
+#include "btThreadSupportInterface.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
+#endif //_XOPEN_SOURCE
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h> //for sysconf
+
+
+///
+/// getNumHardwareThreads()
+///
+///
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int btGetNumHardwareThreads()
+{
+    return btMin<int>(BT_MAX_THREAD_COUNT, std::thread::hardware_concurrency());
+}
+
+#else
+
+int btGetNumHardwareThreads()
+{
+    return btMin<int>(BT_MAX_THREAD_COUNT, sysconf( _SC_NPROCESSORS_ONLN ));
+}
+
+#endif
+
+
+// btThreadSupportPosix helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+class btThreadSupportPosix : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status;
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc
+
+        pthread_t thread;
+        //each thread will wait until this signal to start its work
+        sem_t* startSemaphore;
+
+        // this is a copy of m_mainSemaphore,
+        //each thread will signal once it is finished with its work
+        sem_t* m_mainSemaphore;
+        unsigned long threadUsed;
+    };
+private:
+    typedef unsigned long long UINT64;
+
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    // m_mainSemaphore will signal, if and how many threads are finished with their work
+    sem_t* m_mainSemaphore;
+    int m_numThreads;
+    UINT64 m_startedThreadsMask;
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+    btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportPosix();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    // TODO: return the number of logical processors sharing the first L3 cache
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
+    // TODO: detect if CPU has hyperthreading enabled
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+};
+
+
+#define checkPThreadFunction(returnValue) \
+    if(0 != returnValue) { \
+        printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \
+    }
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
+
+
+btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );
+}
+
+// cleanup/shutdown Libspe2
+btThreadSupportPosix::~btThreadSupportPosix()
+{
+    stopThreads();
+}
+
+#if (defined (__APPLE__))
+#define NAMED_SEMAPHORES
+#endif
+
+
+static sem_t* createSem( const char* baseName )
+{
+    static int semCount = 0;
+#ifdef NAMED_SEMAPHORES
+    /// Named semaphore begin
+    char name[ 32 ];
+    snprintf( name, 32, "/%8.s-%4.d-%4.4d", baseName, getpid(), semCount++ );
+    sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 );
+
+    if ( tempSem != reinterpret_cast<sem_t *>( SEM_FAILED ) )
+    {
+        // printf("Created \"%s\" Semaphore %p\n", name, tempSem);
+    }
+    else
+    {
+        //printf("Error creating Semaphore %d\n", errno);
+        exit( -1 );
+    }
+    /// Named semaphore end
+#else
+    sem_t* tempSem = new sem_t;
+    checkPThreadFunction( sem_init( tempSem, 0, 0 ) );
+#endif
+    return tempSem;
+}
+
+static void destroySem( sem_t* semaphore )
+{
+#ifdef NAMED_SEMAPHORES
+    checkPThreadFunction( sem_close( semaphore ) );
+#else
+    checkPThreadFunction( sem_destroy( semaphore ) );
+    delete semaphore;
+#endif
+}
+
+static void *threadFunction( void *argument )
+{
+    btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument;
+
+    while ( 1 )
+    {
+        checkPThreadFunction( sem_wait( status->startSemaphore ) );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr );
+            status->m_status = 2;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            status->threadUsed++;
+        }
+        else
+        {
+            //exit Thread
+            status->m_status = 3;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            printf( "Thread with taskId %i exiting\n", status->m_taskId );
+            break;
+        }
+    }
+
+    printf( "Thread TERMINATED\n" );
+    return 0;
+}
+
+///send messages to SPUs
+void btThreadSupportPosix::runTask( int threadIndex, void* userData )
+{
+    ///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+    btAssert( threadIndex >= 0 );
+    btAssert( threadIndex < m_activeThreadStatus.size() );
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadsMask |= UINT64( 1 ) << threadIndex;
+
+    // fire event to start new task
+    checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+}
+
+
+///check for messages from SPUs
+int btThreadSupportPosix::waitForResponse()
+{
+    ///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
+    ///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+
+    btAssert( m_activeThreadStatus.size() );
+
+    // wait for any of the threads to finish
+    checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+    // get at least one thread which has finished
+    size_t last = -1;
+
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        if ( 2 == m_activeThreadStatus[ t ].m_status )
+        {
+            last = t;
+            break;
+        }
+    }
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    // need to find an active spu
+    btAssert( last >= 0 );
+    m_startedThreadsMask &= ~( UINT64( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportPosix::waitForAllTasks()
+{
+    while ( m_startedThreadsMask )
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo )
+{
+    m_numThreads = btGetNumHardwareThreads() - 1; // main thread exists already
+    printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads );
+    m_activeThreadStatus.resize( m_numThreads );
+    m_startedThreadsMask = 0;
+
+    m_mainSemaphore = createSem( "main" );
+    //checkPThreadFunction(sem_wait(mainSemaphore));
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        threadStatus.startSemaphore = createSem( "threadLocal" );
+        checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) );
+
+        threadStatus.m_userPtr = 0;
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_mainSemaphore = m_mainSemaphore;
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+        threadStatus.threadUsed = 0;
+
+        printf( "started thread %d \n", i );
+    }
+}
+
+///tell the task scheduler we are done with the SPU tasks
+void btThreadSupportPosix::stopThreads()
+{
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ t ];
+        printf( "%s: Thread %i used: %ld\n", __FUNCTION__, int( t ), threadStatus.threadUsed );
+
+        threadStatus.m_userPtr = 0;
+        checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+        checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+
+        printf( "destroy semaphore\n" );
+        destroySem( threadStatus.startSemaphore );
+        printf( "semaphore destroyed\n" );
+        checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) );
+    }
+    printf( "destroy main semaphore\n" );
+    destroySem( m_mainSemaphore );
+    printf( "main semaphore destroyed\n" );
+    m_activeThreadStatus.clear();
+}
+
+class btCriticalSectionPosix : public btCriticalSection
+{
+    pthread_mutex_t m_mutex;
+
+public:
+    btCriticalSectionPosix()
+    {
+        pthread_mutex_init( &m_mutex, NULL );
+    }
+    virtual ~btCriticalSectionPosix()
+    {
+        pthread_mutex_destroy( &m_mutex );
+    }
+
+    virtual void lock()
+    {
+        pthread_mutex_lock( &m_mutex );
+    }
+    virtual void unlock()
+    {
+        pthread_mutex_unlock( &m_mutex );
+    }
+};
+
+
+btCriticalSection* btThreadSupportPosix::createCriticalSection()
+{
+    return new btCriticalSectionPosix();
+}
+
+void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs )
+{
+    delete cs;
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
+{
+    return new btThreadSupportPosix( info );
+}
+
+#endif // BT_THREADSAFE && !defined( _WIN32 )
+
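The POSIX backend's handshake is worth noting: each worker blocks on its own start semaphore, while completions are funneled through the single shared main semaphore, so waitForResponse() does one sem_wait per outstanding job (macOS uses named sem_open because unnamed sem_init is not implemented there). A condensed, hypothetical sketch of that pattern, not taken from the file above:

```cpp
#include <pthread.h>
#include <semaphore.h>

struct WorkerSlot
{
    sem_t start;        // posted by the pool to hand this worker a job
    sem_t* done;        // shared across workers; posted once per finished job
    void (*fn)(void*);
    void* arg;          // NULL is the shutdown signal, as in stopThreads()
};

static void* workerLoop(void* p)
{
    WorkerSlot* slot = static_cast<WorkerSlot*>(p);
    for (;;)
    {
        sem_wait(&slot->start);   // sleep until runTask() posts
        void* arg = slot->arg;
        if (!arg)
            break;                // pool is shutting down
        slot->fn(arg);
        sem_post(slot->done);     // one post per job; waitForResponse() pairs with it
    }
    sem_post(slot->done);         // acknowledge shutdown
    return NULL;
}
```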
diff --git a/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
new file mode 100644
index 0000000000..00edac650b
--- /dev/null
+++ b/thirdparty/bullet/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -0,0 +1,472 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if defined( _WIN32 ) && BT_THREADSAFE
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "btThreadSupportInterface.h"
+#include <windows.h>
+#include <stdio.h>
+
+
+struct btProcessorInfo
+{
+    int numLogicalProcessors;
+    int numCores;
+    int numNumaNodes;
+    int numL1Cache;
+    int numL2Cache;
+    int numL3Cache;
+    int numPhysicalPackages;
+    static const int maxNumTeamMasks = 32;
+    int numTeamMasks;
+    UINT64 processorTeamMasks[ maxNumTeamMasks ];
+};
+
+UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId )
+{
+    UINT64 procMask = UINT64( 1 ) << procId;
+    for ( int i = 0; i < procInfo.numTeamMasks; ++i )
+    {
+        if ( procMask & procInfo.processorTeamMasks[ i ] )
+        {
+            return procInfo.processorTeamMasks[ i ];
+        }
+    }
+    return 0;
+}
+
+int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId )
+{
+    UINT64 procMask = UINT64( 1 ) << procId;
+    for ( int i = 0; i < procInfo.numTeamMasks; ++i )
+    {
+        if ( procMask & procInfo.processorTeamMasks[ i ] )
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+int countSetBits( ULONG64 bits )
+{
+    int count = 0;
+    while ( bits )
+    {
+        if ( bits & 1 )
+        {
+            count++;
+        }
+        bits >>= 1;
+    }
+    return count;
+}
+
+
+typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD );
+
+
+void getProcessorInformation( btProcessorInfo* procInfo )
+{
+    memset( procInfo, 0, sizeof( *procInfo ) );
+    Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
+        (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" );
+    if ( getLogicalProcInfo == NULL )
+    {
+        // no info
+        return;
+    }
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
+    DWORD bufSize = 0;
+    while ( true )
+    {
+        if ( getLogicalProcInfo( buf, &bufSize ) )
+        {
+            break;
+        }
+        else
+        {
+            if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER )
+            {
+                if ( buf )
+                {
+                    free( buf );
+                }
+                buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize );
+            }
+        }
+    }
+
+    int len = bufSize / sizeof( *buf );
+    for ( int i = 0; i < len; ++i )
+    {
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
+        switch ( info->Relationship )
+        {
+        case RelationNumaNode:
+            procInfo->numNumaNodes++;
+            break;
+
+        case RelationProcessorCore:
+            procInfo->numCores++;
+            procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask );
+            break;
+
+        case RelationCache:
+            if ( info->Cache.Level == 1 )
+            {
+                procInfo->numL1Cache++;
+            }
+            else if ( info->Cache.Level == 2 )
+            {
+                procInfo->numL2Cache++;
+            }
+            else if ( info->Cache.Level == 3 )
+            {
+                procInfo->numL3Cache++;
+                // processors that share L3 cache are considered to be on the same team
+                // because they can more easily work together on the same data.
+                // Large performance penalties will occur if 2 or more threads from different
+                // teams attempt to frequently read and modify the same cache lines.
+                //
+                // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
+                // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
+                // CCXs are operating on the same data, many cycles will be spent keeping the
+                // two caches coherent.
+                if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks )
+                {
+                    procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask;
+                    procInfo->numTeamMasks++;
+                }
+            }
+            break;
+
+        case RelationProcessorPackage:
+            procInfo->numPhysicalPackages++;
+            break;
+        }
+    }
+    free( buf );
+}
+
+
+
+///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+class btThreadSupportWin32 : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status;
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc
+
+        void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
+
+        void* m_eventStartHandle;
+        char m_eventStartHandleName[ 32 ];
+
+        void* m_eventCompleteHandle;
+        char m_eventCompleteHandleName[ 32 ];
+    };
+
+private:
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    btAlignedObjectArray<void*> m_completeHandles;
+    int m_numThreads;
+    DWORD_PTR m_startedThreadMask;
+    btProcessorInfo m_processorInfo;
+
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+
+    btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportWin32();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+};
+
+
+btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );
+}
+
+
+btThreadSupportWin32::~btThreadSupportWin32()
+{
+    stopThreads();
+}
+
+
+DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
+{
+    btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam;
+
+    while ( 1 )
+    {
+        WaitForSingleObject( status->m_eventStartHandle, INFINITE );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr );
+            status->m_status = 2;
+            SetEvent( status->m_eventCompleteHandle );
+        }
+        else
+        {
+            //exit Thread
+            status->m_status = 3;
+            printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle );
+            SetEvent( status->m_eventCompleteHandle );
+            break;
+        }
+    }
+    printf( "Thread TERMINATED\n" );
+    return 0;
+}
+
+
+void btThreadSupportWin32::runTask( int threadIndex, void* userData )
+{
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+    btAssert( threadIndex >= 0 );
+    btAssert( int( threadIndex ) < m_activeThreadStatus.size() );
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex;
+
+    ///fire event to start new task
+    SetEvent( threadStatus.m_eventStartHandle );
+}
+
+
+int btThreadSupportWin32::waitForResponse()
+{
+    btAssert( m_activeThreadStatus.size() );
+
+    int last = -1;
+    DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE );
+    btAssert( res != WAIT_FAILED );
+    last = res - WAIT_OBJECT_0;
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+    btAssert( threadStatus.m_threadHandle );
+    btAssert( threadStatus.m_eventCompleteHandle );
+
+    //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    ///need to find an active spu
+    btAssert( last >= 0 );
+    m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportWin32::waitForAllTasks()
+{
+    while ( m_startedThreadMask )
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo )
+{
+    static int uniqueId = 0;
+    uniqueId++;
+    btProcessorInfo& procInfo = m_processorInfo;
+    getProcessorInformation( &procInfo );
+    DWORD_PTR dwProcessAffinityMask = 0;
+    DWORD_PTR dwSystemAffinityMask = 0;
+    if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) )
+    {
+        dwProcessAffinityMask = 0;
+    }
+    ///The number of threads should be equal to the number of available cores - 1
+    m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
+
+    m_activeThreadStatus.resize( m_numThreads );
+    m_completeHandles.resize( m_numThreads );
+    m_startedThreadMask = 0;
+
+    // set main thread affinity
+    if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 ))
+    {
+        SetThreadAffinityMask( GetCurrentThread(), mask );
+        SetThreadIdealProcessor( GetCurrentThread(), 0 );
+    }
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+
+        LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
+        SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
+        LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
+        LPVOID lpParameter = &threadStatus;
+        DWORD dwCreationFlags = 0;
+        LPDWORD lpThreadId = 0;
+
+        threadStatus.m_userPtr = 0;
+
+        sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName );
+
+        sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName );
+
+        m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle;
+
+        HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId );
+        //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
+        // highest priority -- can cause erratic performance when numThreads > numCores
+        // we don't want worker threads to be higher priority than the main thread or the main thread could get
+        // totally shut out and unable to tell the workers to stop
+        //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
+
+        {
+            int processorId = i + 1; // leave processor 0 for main thread
+            DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId );
+            if ( teamMask )
+            {
+                // bind each thread to only execute on processors of its assigned team
+                // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
+                // - for multi-socket Intel this will keep threads from migrating from one socket to another
+                // - for AMD Ryzen this will keep threads from migrating from one CCX to another
+                DWORD_PTR mask = teamMask & dwProcessAffinityMask;
+                if ( mask )
+                {
+                    SetThreadAffinityMask( handle, mask );
+                }
+            }
+            SetThreadIdealProcessor( handle, processorId );
+        }
+
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_threadHandle = handle;
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+        printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
+    }
+}
+
+///tell the task scheduler we are done with the SPU tasks
+void btThreadSupportWin32::stopThreads()
+{
+    for ( int i = 0; i < m_activeThreadStatus.size(); i++ )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        if ( threadStatus.m_status > 0 )
+        {
+            WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
+        }
+
+        threadStatus.m_userPtr = NULL;
+        SetEvent( threadStatus.m_eventStartHandle );
+        WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
+
+        CloseHandle( threadStatus.m_eventCompleteHandle );
+        CloseHandle( threadStatus.m_eventStartHandle );
+        CloseHandle( threadStatus.m_threadHandle );
+    }
+
+    m_activeThreadStatus.clear();
+    m_completeHandles.clear();
+}
+
+
+class btWin32CriticalSection : public btCriticalSection
+{
+private:
+    CRITICAL_SECTION mCriticalSection;
+
+public:
+    btWin32CriticalSection()
+    {
+        InitializeCriticalSection( &mCriticalSection );
+    }
+
+    ~btWin32CriticalSection()
+    {
+        DeleteCriticalSection( &mCriticalSection );
+    }
+
+    void lock()
+    {
+        EnterCriticalSection( &mCriticalSection );
+    }
+
+    void unlock()
+    {
+        LeaveCriticalSection( &mCriticalSection );
+    }
+};
+
+
+btCriticalSection* btThreadSupportWin32::createCriticalSection()
+{
+    unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 );
+    btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection();
+    return cs;
+}
+
+void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection )
+{
+    criticalSection->~btCriticalSection();
+    btAlignedFree( criticalSection );
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
+{
+    return new btThreadSupportWin32( info );
+}
+
+
+
+#endif //defined(_WIN32) && BT_THREADSAFE
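Putting the three files together: btTaskSchedulerDefault::init() sizes its job queues from the two topology queries implemented above. A worked example for hypothetical hardware (on the POSIX backend the logical-to-physical ratio is hardwired to 1, so every worker gets its own queue):

```cpp
#include <cstdio>

int main()
{
    // Example: 8 physical cores, 16 logical processors (SMT), Win32 backend.
    const int numLogicalProcessors = 16;
    const int numCores = 8;

    int maxNumThreads = numLogicalProcessors;                 // workers + main thread
    int numThreadsPerQueue = numLogicalProcessors / numCores; // getLogicalToPhysicalCoreRatio()
    int numJobQueues = (numThreadsPerQueue == 1)
        ? (maxNumThreads - 1)                   // one queue per worker thread
        : (maxNumThreads / numThreadsPerQueue); // one queue per physical core, shared by 2 hyperthreads
    std::printf("%d job queues serve %d threads\n", numJobQueues, maxNumThreads);
    return 0;
}
```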