#include "LinearMath/btMinMax.h" #include "LinearMath/btAlignedObjectArray.h" #include "LinearMath/btThreads.h" #include "LinearMath/btQuickprof.h" #include <stdio.h> #include <algorithm> #if BT_THREADSAFE #include "btThreadSupportInterface.h" #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN #include <windows.h> #endif typedef unsigned long long btU64; static const int kCacheLineSize = 64; void btSpinPause() { #if defined(_WIN32) YieldProcessor(); #endif } struct WorkerThreadStatus { enum Type { kInvalid, kWaitingForWork, kWorking, kSleeping, }; }; ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives { static const int kMaxThreadCount = BT_MAX_THREAD_COUNT; // directives for all worker threads packed into a single cacheline char m_threadDirs[kMaxThreadCount]; public: enum Type { kInvalid, kGoToSleep, // go to sleep kStayAwakeButIdle, // wait for not checking job queue kScanForJobs, // actively scan job queue for jobs }; WorkerThreadDirectives() { for (int i = 0; i < kMaxThreadCount; ++i) { m_threadDirs[i] = 0; } } Type getDirective(int threadId) { btAssert(threadId < kMaxThreadCount); return static_cast<Type>(m_threadDirs[threadId]); } void setDirectiveByRange(int threadBegin, int threadEnd, Type dir) { btAssert(threadBegin < threadEnd); btAssert(threadEnd <= kMaxThreadCount); char dirChar = static_cast<char>(dir); for (int i = threadBegin; i < threadEnd; ++i) { m_threadDirs[i] = dirChar; } } }; class JobQueue; ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage { int m_threadId; WorkerThreadStatus::Type m_status; int m_numJobsFinished; btSpinMutex m_mutex; btScalar m_sumResult; WorkerThreadDirectives* m_directive; JobQueue* m_queue; btClock* m_clock; unsigned int m_cooldownTime; }; struct IJob { virtual void executeJob(int threadId) = 0; }; class ParallelForJob : public IJob { const btIParallelForBody* m_body; int m_begin; int m_end; public: ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body) { m_body = &body; m_begin = iBegin; m_end = iEnd; } virtual void executeJob(int threadId) BT_OVERRIDE { BT_PROFILE("executeJob"); // call the functor body to do the work m_body->forLoop(m_begin, m_end); } }; class ParallelSumJob : public IJob { const btIParallelSumBody* m_body; ThreadLocalStorage* m_threadLocalStoreArray; int m_begin; int m_end; public: ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls) { m_body = &body; m_threadLocalStoreArray = tls; m_begin = iBegin; m_end = iEnd; } virtual void executeJob(int threadId) BT_OVERRIDE { BT_PROFILE("executeJob"); // call the functor body to do the work btScalar val = m_body->sumLoop(m_begin, m_end); #if BT_PARALLEL_SUM_DETERMINISTISM // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision) const float TRUNC_SCALE = float(1 << 19); val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE; // truncate some bits #endif m_threadLocalStoreArray[threadId].m_sumResult += val; } }; ATTRIBUTE_ALIGNED64(class) JobQueue { btThreadSupportInterface* m_threadSupport; btCriticalSection* m_queueLock; btSpinMutex m_mutex; btAlignedObjectArray<IJob*> m_jobQueue; char* m_jobMem; int m_jobMemSize; bool m_queueIsEmpty; int m_tailIndex; int m_headIndex; int m_allocSize; bool m_useSpinMutex; btAlignedObjectArray<JobQueue*> m_neighborContexts; char m_cachePadding[kCacheLineSize]; // prevent false sharing void freeJobMem() { if (m_jobMem) { // free old btAlignedFree(m_jobMem); m_jobMem = NULL; } } void resizeJobMem(int newSize) { if (newSize > m_jobMemSize) { freeJobMem(); 
			m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
			m_jobMemSize = newSize;
		}
	}

public:
	JobQueue()
	{
		m_jobMem = NULL;
		m_jobMemSize = 0;
		m_threadSupport = NULL;
		m_queueLock = NULL;
		m_headIndex = 0;
		m_tailIndex = 0;
		m_useSpinMutex = false;
	}
	~JobQueue()
	{
		exit();
	}
	void exit()
	{
		freeJobMem();
		if (m_queueLock && m_threadSupport)
		{
			m_threadSupport->deleteCriticalSection(m_queueLock);
			m_queueLock = NULL;
			m_threadSupport = NULL;
		}
	}

	void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
	{
		m_threadSupport = threadSup;
		if (threadSup)
		{
			m_queueLock = m_threadSupport->createCriticalSection();
		}
		setupJobStealing(contextArray, contextArray->size());
	}
	void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)
	{
		btAlignedObjectArray<JobQueue>& contexts = *contextArray;
		int selfIndex = 0;
		for (int i = 0; i < contexts.size(); ++i)
		{
			if (this == &contexts[i])
			{
				selfIndex = i;
				break;
			}
		}
		int numNeighbors = btMin(2, contexts.size() - 1);
		int neighborOffsets[] = {-1, 1, -2, 2, -3, 3};
		int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]);
		m_neighborContexts.reserve(numNeighbors);
		m_neighborContexts.resizeNoInitialize(0);
		for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
		{
			int neighborIndex = selfIndex + neighborOffsets[i];
			if (neighborIndex >= 0 && neighborIndex < numActiveContexts)
			{
				m_neighborContexts.push_back(&contexts[neighborIndex]);
			}
		}
	}

	bool isQueueEmpty() const { return m_queueIsEmpty; }
	void lockQueue()
	{
		if (m_useSpinMutex)
		{
			m_mutex.lock();
		}
		else
		{
			m_queueLock->lock();
		}
	}
	void unlockQueue()
	{
		if (m_useSpinMutex)
		{
			m_mutex.unlock();
		}
		else
		{
			m_queueLock->unlock();
		}
	}
	void clearQueue(int jobCount, int jobSize)
	{
		lockQueue();
		m_headIndex = 0;
		m_tailIndex = 0;
		m_allocSize = 0;
		m_queueIsEmpty = true;
		int jobBufSize = jobSize * jobCount;
		// make sure we have enough memory allocated to store jobs
		if (jobBufSize > m_jobMemSize)
		{
			resizeJobMem(jobBufSize);
		}
		// make sure the job queue is big enough
		if (jobCount > m_jobQueue.capacity())
		{
			m_jobQueue.reserve(jobCount);
		}
		unlockQueue();
		m_jobQueue.resizeNoInitialize(0);
	}
	void* allocJobMem(int jobSize)
	{
		btAssert(m_jobMemSize >= (m_allocSize + jobSize));
		void* jobMem = &m_jobMem[m_allocSize];
		m_allocSize += jobSize;
		return jobMem;
	}
	void submitJob(IJob* job)
	{
		btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
		m_jobQueue.push_back(job);
		lockQueue();
		m_tailIndex++;
		m_queueIsEmpty = false;
		unlockQueue();
	}
	IJob* consumeJobFromOwnQueue()
	{
		if (m_queueIsEmpty)
		{
			// lock-free path: even if this is taken erroneously it isn't harmful
			return NULL;
		}
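		// slow path: take the queue lock and consume the job at the head index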
		IJob* job = NULL;
		lockQueue();
		if (!m_queueIsEmpty)
		{
			job = m_jobQueue[m_headIndex++];
			btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
			if (m_headIndex == m_tailIndex)
			{
				m_queueIsEmpty = true;
			}
		}
		unlockQueue();
		return job;
	}
	IJob* consumeJob()
	{
		if (IJob* job = consumeJobFromOwnQueue())
		{
			return job;
		}
		// own queue is empty, try to steal from a neighbor
		for (int i = 0; i < m_neighborContexts.size(); ++i)
		{
			JobQueue* otherContext = m_neighborContexts[i];
			if (IJob* job = otherContext->consumeJobFromOwnQueue())
			{
				return job;
			}
		}
		return NULL;
	}
};

static void WorkerThreadFunc(void* userPtr)
{
	BT_PROFILE("WorkerThreadFunc");
	ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr;
	JobQueue* jobQueue = localStorage->m_queue;

	bool shouldSleep = false;
	int threadId = localStorage->m_threadId;
	while (!shouldSleep)
	{
		// do work
		localStorage->m_mutex.lock();
		while (IJob* job = jobQueue->consumeJob())
		{
			localStorage->m_status = WorkerThreadStatus::kWorking;
			job->executeJob(threadId);
			localStorage->m_numJobsFinished++;
		}
		localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
		localStorage->m_mutex.unlock();
		btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
		// while the queue is empty,
		while (jobQueue->isQueueEmpty())
		{
			// todo: spin wait a bit to avoid hammering the empty queue
			btSpinPause();
			if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep)
			{
				shouldSleep = true;
				break;
			}
			// if jobs are incoming,
			if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs)
			{
				clockStart = localStorage->m_clock->getTimeMicroseconds();  // reset the clock
			}
			else
			{
				for (int i = 0; i < 50; ++i)
				{
					btSpinPause();
					btSpinPause();
					btSpinPause();
					btSpinPause();
					if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
					{
						break;
					}
				}
				// if no jobs are incoming and the queue has been empty for the cooldown time, go to sleep
				btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
				if (timeElapsed > localStorage->m_cooldownTime)
				{
					shouldSleep = true;
					break;
				}
			}
		}
	}
	{
		BT_PROFILE("sleep");
		// go to sleep
		localStorage->m_mutex.lock();
		localStorage->m_status = WorkerThreadStatus::kSleeping;
		localStorage->m_mutex.unlock();
	}
}

class btTaskSchedulerDefault : public btITaskScheduler
{
	btThreadSupportInterface* m_threadSupport;
	WorkerThreadDirectives* m_workerDirective;
	btAlignedObjectArray<JobQueue> m_jobQueues;
	btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
	btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
	btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
	btClock m_clock;
	int m_numThreads;
	int m_numWorkerThreads;
	int m_numActiveJobQueues;
	int m_maxNumThreads;
	int m_numJobs;
	static const int kFirstWorkerThreadId = 1;

public:
	btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
	{
		m_threadSupport = NULL;
		m_workerDirective = NULL;
	}

	virtual ~btTaskSchedulerDefault()
	{
		waitForWorkersToSleep();

		for (int i = 0; i < m_jobQueues.size(); ++i)
		{
			m_jobQueues[i].exit();
		}

		if (m_threadSupport)
		{
			delete m_threadSupport;
			m_threadSupport = NULL;
		}
		if (m_workerDirective)
		{
			btAlignedFree(m_workerDirective);
			m_workerDirective = NULL;
		}
	}

	void init()
	{
		btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc);
		m_threadSupport = btThreadSupportInterface::create(constructionInfo);
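		// the directives live in a single 64-byte-aligned block, so the workers all poll one shared cache line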
		m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));

		m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
		m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
		m_numThreads = m_maxNumThreads;
		// ideally, have one job queue for each physical processor (except for the main thread, which needs no queue)
		int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
		int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue);
		m_jobQueues.resize(numJobQueues);
		m_numActiveJobQueues = numJobQueues;
		for (int i = 0; i < m_jobQueues.size(); ++i)
		{
			m_jobQueues[i].init(m_threadSupport, &m_jobQueues);
		}
		m_perThreadJobQueues.resize(m_numThreads);
		for (int i = 0; i < m_numThreads; i++)
		{
			JobQueue* jq = NULL;
			// only worker threads get a job queue
			if (i > 0)
			{
				if (numThreadsPerQueue == 1)
				{
					// one queue per worker thread
					jq = &m_jobQueues[i - kFirstWorkerThreadId];
				}
				else
				{
					// 2 threads share each queue
					jq = &m_jobQueues[i / numThreadsPerQueue];
				}
			}
			m_perThreadJobQueues[i] = jq;
		}
		m_threadLocalStorage.resize(m_numThreads);
		for (int i = 0; i < m_numThreads; i++)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			storage.m_threadId = i;
			storage.m_directive = m_workerDirective;
			storage.m_status = WorkerThreadStatus::kSleeping;
			storage.m_cooldownTime = 100;  // 100 microseconds; workers go to sleep after this long if they have nothing to do
			storage.m_clock = &m_clock;
			storage.m_queue = m_perThreadJobQueues[i];
		}
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);  // no work for them yet
		setNumThreads(m_threadSupport->getCacheFriendlyNumThreads());
	}

	void setWorkerDirectives(WorkerThreadDirectives::Type dir)
	{
		m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
	}

	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return m_maxNumThreads;
	}

	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}

	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1);
		m_numWorkerThreads = m_numThreads - 1;
		m_numActiveJobQueues = 0;
		// if there is at least 1 worker,
		if (m_numWorkerThreads > 0)
		{
			// re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
			JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1];
			int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
			m_numActiveJobQueues = iLastActiveContext + 1;
			for (int i = 0; i < m_jobQueues.size(); ++i)
			{
				m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues);
			}
		}
		m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
	}

	void waitJobs()
	{
		BT_PROFILE("waitJobs");
		// have the main thread work until the job queues are empty
		int numMainThreadJobsFinished = 0;
		for (int i = 0; i < m_numActiveJobQueues; ++i)
		{
			while (IJob* job = m_jobQueues[i].consumeJob())
			{
				job->executeJob(0);
				numMainThreadJobsFinished++;
			}
		}

		// done with jobs for now, tell workers to rest (but not sleep)
		setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle);

		btU64 clockStart = m_clock.getTimeMicroseconds();
		// wait for workers to finish any jobs in progress
		while (true)
		{
			int numWorkerJobsFinished = 0;
			for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread)
			{
				ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
				storage->m_mutex.lock();
				numWorkerJobsFinished += storage->m_numJobsFinished;
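				// (the worker holds this mutex while executing jobs, so the count is only ever read between jobs)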
				storage->m_mutex.unlock();
			}
			if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
			{
				break;
			}
			btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
			btAssert(timeElapsed < 1000);
			if (timeElapsed > 100000)
			{
				break;
			}
			btSpinPause();
		}
	}

	void wakeWorkers(int numWorkersToWake)
	{
		BT_PROFILE("wakeWorkers");
		btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs);
		int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
		int numActiveWorkers = 0;
		for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker)
		{
			// note: this count of active workers is not necessarily totally reliable, because a worker thread could be
			// just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
			ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
			if (storage.m_status != WorkerThreadStatus::kSleeping)
			{
				numActiveWorkers++;
			}
		}
		for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
			if (storage.m_status == WorkerThreadStatus::kSleeping)
			{
				m_threadSupport->runTask(iWorker, &storage);
				numActiveWorkers++;
			}
		}
	}

	void waitForWorkersToSleep()
	{
		BT_PROFILE("waitForWorkersToSleep");
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
		m_threadSupport->waitForAllTasks();
		for (int i = kFirstWorkerThreadId; i < m_numThreads; i++)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			btAssert(storage.m_status == WorkerThreadStatus::kSleeping);
		}
	}

	virtual void sleepWorkerThreadsHint() BT_OVERRIDE
	{
		BT_PROFILE("sleepWorkerThreadsHint");
		// hint the task scheduler that we may not be using these threads for a little while
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
	}

	void prepareWorkerThreads()
	{
		for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			storage.m_mutex.lock();
			storage.m_numJobsFinished = 0;
			storage.m_mutex.unlock();
		}
		setWorkerDirectives(WorkerThreadDirectives::kScanForJobs);
	}

	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_ThreadSupport");
		btAssert(iEnd >= iBegin);
		btAssert(grainSize >= 1);
		int iterationCount = iEnd - iBegin;
		if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
		{
			typedef ParallelForJob JobType;
			int jobCount = (iterationCount + grainSize - 1) / grainSize;
			m_numJobs = jobCount;
			btAssert(jobCount >= 2);  // need more than one job for multithreading
			int jobSize = sizeof(JobType);

			for (int i = 0; i < m_numActiveJobQueues; ++i)
			{
				m_jobQueues[i].clearQueue(jobCount, jobSize);
			}
			// prepare worker threads for incoming work
			prepareWorkerThreads();
			// submit all of the jobs
			int iJob = 0;
			int iThread = kFirstWorkerThreadId;  // first worker thread
			for (int i = iBegin; i < iEnd; i += grainSize)
			{
				btAssert(iJob < jobCount);
				int iE = btMin(i + grainSize, iEnd);
				JobQueue* jq = m_perThreadJobQueues[iThread];
				btAssert(jq);
				btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
				void* jobMem = jq->allocJobMem(jobSize);
				JobType* job = new (jobMem) ParallelForJob(i, iE, body);  // placement new
				jq->submitJob(job);
				iJob++;
				iThread++;
				if (iThread >= m_numThreads)
				{
					iThread = kFirstWorkerThreadId;  // wrap back to the first worker thread
				}
			}
			wakeWorkers(jobCount - 1);

			// put the main thread to work on emptying the job queue and then wait for all workers to finish
			waitJobs();
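			// all jobs are complete; allow subsequent parallelFor/parallelSum calls to go wide again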
			m_antiNestingLock.unlock();
		}
		else
		{
			BT_PROFILE("parallelFor_mainThread");
			// just run on the main thread
			body.forLoop(iBegin, iEnd);
		}
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_ThreadSupport");
		btAssert(iEnd >= iBegin);
		btAssert(grainSize >= 1);
		int iterationCount = iEnd - iBegin;
		if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
		{
			typedef ParallelSumJob JobType;
			int jobCount = (iterationCount + grainSize - 1) / grainSize;
			m_numJobs = jobCount;
			btAssert(jobCount >= 2);  // need more than one job for multithreading
			int jobSize = sizeof(JobType);
			for (int i = 0; i < m_numActiveJobQueues; ++i)
			{
				m_jobQueues[i].clearQueue(jobCount, jobSize);
			}

			// initialize summation
			for (int iThread = 0; iThread < m_numThreads; ++iThread)
			{
				m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
			}

			// prepare worker threads for incoming work
			prepareWorkerThreads();
			// submit all of the jobs
			int iJob = 0;
			int iThread = kFirstWorkerThreadId;  // first worker thread
			for (int i = iBegin; i < iEnd; i += grainSize)
			{
				btAssert(iJob < jobCount);
				int iE = btMin(i + grainSize, iEnd);
				JobQueue* jq = m_perThreadJobQueues[iThread];
				btAssert(jq);
				btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
				void* jobMem = jq->allocJobMem(jobSize);
				JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]);  // placement new
				jq->submitJob(job);
				iJob++;
				iThread++;
				if (iThread >= m_numThreads)
				{
					iThread = kFirstWorkerThreadId;  // wrap back to the first worker thread
				}
			}
			wakeWorkers(jobCount - 1);

			// put the main thread to work on emptying the job queue and then wait for all workers to finish
			waitJobs();

			// add up all the thread sums
			btScalar sum = btScalar(0);
			for (int iThread = 0; iThread < m_numThreads; ++iThread)
			{
				sum += m_threadLocalStorage[iThread].m_sumResult;
			}
			m_antiNestingLock.unlock();
			return sum;
		}
		else
		{
			BT_PROFILE("parallelSum_mainThread");
			// just run on the main thread
			return body.sumLoop(iBegin, iEnd);
		}
	}
};

btITaskScheduler* btCreateDefaultTaskScheduler()
{
	btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
	ts->init();
	return ts;
}

#else  // #if BT_THREADSAFE

btITaskScheduler* btCreateDefaultTaskScheduler()
{
	return NULL;
}

#endif  // #else // #if BT_THREADSAFE
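
/*
Usage sketch: this scheduler is normally driven through the public API declared in
LinearMath/btThreads.h (btCreateDefaultTaskScheduler, btSetTaskScheduler, btParallelFor).
A minimal example follows; SquareBody and runExample are illustrative names, not part of Bullet.

	struct SquareBody : public btIParallelForBody
	{
		float* m_out;
		virtual void forLoop(int iBegin, int iEnd) const BT_OVERRIDE
		{
			// each job fills its own contiguous slice of the output array
			for (int i = iBegin; i < iEnd; ++i)
			{
				m_out[i] = float(i) * float(i);
			}
		}
	};

	void runExample(float* out)
	{
		// btCreateDefaultTaskScheduler() returns NULL when built without BT_THREADSAFE
		if (btITaskScheduler* scheduler = btCreateDefaultTaskScheduler())
		{
			btSetTaskScheduler(scheduler);
			SquareBody body;
			body.m_out = out;
			// 1024 iterations with grainSize 64 -> 16 ParallelForJobs spread across the worker queues
			btParallelFor(0, 1024, 64, body);
		}
	}
*/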