1 files changed, 1128 insertions, 0 deletions
diff --git a/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
new file mode 100644
index 0000000000..c82ba87f9f
--- /dev/null
+++ b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -0,0 +1,1128 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "btBatchedConstraints.h"
+
+#include "LinearMath/btIDebugDraw.h"
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btStackAlloc.h"
+#include "LinearMath/btQuickprof.h"
+
+#include <string.h> //for memset
+
+const int kNoMerge = -1;
+
+bool btBatchedConstraints::s_debugDrawBatches = false;
+
+
+struct btBatchedConstraintInfo
+{
+    int constraintIndex;
+    int numConstraintRows;
+    int bodyIds[2];
+};
+
+
+struct btBatchInfo
+{
+    int numConstraints;
+    int mergeIndex;
+
+    btBatchInfo() : numConstraints(0), mergeIndex(kNoMerge) {}
+};
+
+
+bool btBatchedConstraints::validate(btConstraintArray* constraints, const btAlignedObjectArray<btSolverBody>& bodies) const
+{
+    //
+    // validate: for debugging only. Verify coloring of bodies, that no body is touched by more than one batch in any given phase
+    //
+    int errors = 0;
+    const int kUnassignedBatch = -1;
+
+    btAlignedObjectArray<int> bodyBatchId;
+    for (int iPhase = 0; iPhase < m_phases.size(); ++iPhase)
+    {
+        bodyBatchId.resizeNoInitialize(0);
+        bodyBatchId.resize( bodies.size(), kUnassignedBatch );
+        const Range& phase = m_phases[iPhase];
+        for (int iBatch = phase.begin; iBatch < phase.end; ++iBatch)
+        {
+            const Range& batch = m_batches[iBatch];
+            for (int iiCons = batch.begin; iiCons < batch.end; ++iiCons)
+            {
+                int iCons = m_constraintIndices[iiCons];
+                const btSolverConstraint& cons = constraints->at(iCons);
+                const btSolverBody& bodyA = bodies[cons.m_solverBodyIdA];
+                const btSolverBody& bodyB = bodies[cons.m_solverBodyIdB];
+                if (! bodyA.internalGetInvMass().isZero())
+                {
+                    int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdA];
+                    if (thisBodyBatchId == kUnassignedBatch)
+                    {
+                        bodyBatchId[cons.m_solverBodyIdA] = iBatch;
+                    }
+                    else if (thisBodyBatchId != iBatch)
+                    {
+                        btAssert( !"dynamic body is used in 2 different batches in the same phase" );
+                        errors++;
+                    }
+                }
+                if (! bodyB.internalGetInvMass().isZero())
+                {
+                    int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdB];
+                    if (thisBodyBatchId == kUnassignedBatch)
+                    {
+                        bodyBatchId[cons.m_solverBodyIdB] = iBatch;
+                    }
+                    else if (thisBodyBatchId != iBatch)
+                    {
+                        btAssert( !"dynamic body is used in 2 different batches in the same phase" );
+                        errors++;
+                    }
+                }
+            }
+        }
+    }
+    return errors == 0;
+}
+
+
+static void debugDrawSingleBatch( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int iBatch,
+    const btVector3& color,
+    const btVector3& offset
+    )
+{
+    if (bc && bc->m_debugDrawer && iBatch < bc->m_batches.size())
+    {
+        const btBatchedConstraints::Range& b = bc->m_batches[iBatch];
+        for (int iiCon = b.begin; iiCon < b.end; ++iiCon)
+        {
+            int iCon = bc->m_constraintIndices[iiCon];
+            const btSolverConstraint& con = constraints->at(iCon);
+            int iBody0 = con.m_solverBodyIdA;
+            int iBody1 = con.m_solverBodyIdB;
+            btVector3 pos0 = bodies[iBody0].getWorldTransform().getOrigin() + offset;
+            btVector3 pos1 = bodies[iBody1].getWorldTransform().getOrigin() + offset;
+            bc->m_debugDrawer->drawLine(pos0, pos1, color);
+        }
+    }
+}
+
+
+static void debugDrawPhase( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int iPhase,
+    const btVector3& color0,
+    const btVector3& color1,
+    const btVector3& offset
+    )
+{
+    BT_PROFILE( "debugDrawPhase" );
+    if ( bc && bc->m_debugDrawer && iPhase < bc->m_phases.size() )
+    {
+        const btBatchedConstraints::Range& phase = bc->m_phases[iPhase];
+        for (int iBatch = phase.begin; iBatch < phase.end; ++iBatch)
+        {
+            float tt = float(iBatch - phase.begin) / float(btMax(1, phase.end - phase.begin - 1));
+            btVector3 col = lerp(color0, color1, tt);
+            debugDrawSingleBatch(bc, constraints, bodies, iBatch, col, offset);
+        }
+    }
+}
+
+
+static void debugDrawAllBatches( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies
+    )
+{
+    BT_PROFILE( "debugDrawAllBatches" );
+    if ( bc && bc->m_debugDrawer && bc->m_phases.size() > 0 )
+    {
+        btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
+        btVector3 bboxMax = -bboxMin;
+        for (int iBody = 0; iBody < bodies.size(); ++iBody)
+        {
+            const btVector3& pos = bodies[iBody].getWorldTransform().getOrigin();
+            bboxMin.setMin(pos);
+            bboxMax.setMax(pos);
+        }
+        btVector3 bboxExtent = bboxMax - bboxMin;
+        btVector3 offsetBase = btVector3( 0, bboxExtent.y()*1.1f, 0 );
+        btVector3 offsetStep = btVector3( 0, 0, bboxExtent.z()*1.1f );
+        int numPhases = bc->m_phases.size();
+        for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+        {
+            float b = float(iPhase)/float(numPhases-1);
+            btVector3 color0 = btVector3(1,0,b);
+            btVector3 color1 = btVector3(0,1,b);
+            btVector3 offset = offsetBase + offsetStep*(float(iPhase) - float(numPhases-1)*0.5);
+            debugDrawPhase(bc, constraints, bodies, iPhase, color0, color1, offset);
+        }
+    }
+}
+
+
+static void initBatchedBodyDynamicFlags(btAlignedObjectArray<bool>* outBodyDynamicFlags, const btAlignedObjectArray<btSolverBody>& bodies)
+{
+    BT_PROFILE("initBatchedBodyDynamicFlags");
+    btAlignedObjectArray<bool>& bodyDynamicFlags = *outBodyDynamicFlags;
+    bodyDynamicFlags.resizeNoInitialize(bodies.size());
+    for (int i = 0; i < bodies.size(); ++i)
+    {
+        const btSolverBody& body = bodies[ i ];
+        bodyDynamicFlags[i] = ( body.internalGetInvMass().x() > btScalar( 0 ) );
+    }
+}
+
+
+static int runLengthEncodeConstraintInfo(btBatchedConstraintInfo* outConInfos, int numConstraints)
+{
+    BT_PROFILE("runLengthEncodeConstraintInfo");
+    // detect and run-length encode constraint rows that repeat the same bodies
+    int iDest = 0;
+    int iSrc = 0;
+    while (iSrc < numConstraints)
+    {
+        const btBatchedConstraintInfo& srcConInfo = outConInfos[iSrc];
+        btBatchedConstraintInfo& conInfo = outConInfos[iDest];
+        conInfo.constraintIndex = iSrc;
+        conInfo.bodyIds[0] = srcConInfo.bodyIds[0];
+        conInfo.bodyIds[1] = srcConInfo.bodyIds[1];
+        while (iSrc < numConstraints && outConInfos[iSrc].bodyIds[0] == srcConInfo.bodyIds[0] && outConInfos[iSrc].bodyIds[1] == srcConInfo.bodyIds[1])
+        {
+            ++iSrc;
+        }
+        conInfo.numConstraintRows = iSrc - conInfo.constraintIndex;
+        ++iDest;
+    }
+    return iDest;
+}
+
+
+struct ReadSolverConstraintsLoop : public btIParallelForBody
+{
+    btBatchedConstraintInfo* m_outConInfos;
+    btConstraintArray* m_constraints;
+
+    ReadSolverConstraintsLoop( btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints )
+    {
+        m_outConInfos = outConInfos;
+        m_constraints = constraints;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        for (int i = iBegin; i < iEnd; ++i)
+        {
+            btBatchedConstraintInfo& conInfo = m_outConInfos[i];
+            const btSolverConstraint& con = m_constraints->at( i );
+            conInfo.bodyIds[0] = con.m_solverBodyIdA;
+            conInfo.bodyIds[1] = con.m_solverBodyIdB;
+            conInfo.constraintIndex = i;
+            conInfo.numConstraintRows = 1;
+        }
+    }
+};
+
+
+static int initBatchedConstraintInfo(btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints)
+{
+    BT_PROFILE("initBatchedConstraintInfo");
+    int numConstraints = constraints->size();
+    bool inParallel = true;
+    if (inParallel)
+    {
+        ReadSolverConstraintsLoop loop(outConInfos, constraints);
+        int grainSize = 1200;
+        btParallelFor(0, numConstraints, grainSize, loop);
+    }
+    else
+    {
+        for (int i = 0; i < numConstraints; ++i)
+        {
+            btBatchedConstraintInfo& conInfo = outConInfos[i];
+            const btSolverConstraint& con = constraints->at( i );
+            conInfo.bodyIds[0] = con.m_solverBodyIdA;
+            conInfo.bodyIds[1] = con.m_solverBodyIdB;
+            conInfo.constraintIndex = i;
+            conInfo.numConstraintRows = 1;
+        }
+    }
+    bool useRunLengthEncoding = true;
+    if (useRunLengthEncoding)
+    {
+        numConstraints = runLengthEncodeConstraintInfo(outConInfos, numConstraints);
+    }
+    return numConstraints;
+}
+
+
+static void expandConstraintRowsInPlace(int* constraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRowsInPlace");
+    if (numConstraintRows > numConstraints)
+    {
+        // we walk the array in reverse to avoid overwriteing
+        for (int iCon = numConstraints - 1; iCon >= 0; --iCon)
+        {
+            const btBatchedConstraintInfo& conInfo = conInfos[iCon];
+            int iBatch = constraintBatchIds[iCon];
+            for (int i = conInfo.numConstraintRows - 1; i >= 0; --i)
+            {
+                int iDest = conInfo.constraintIndex + i;
+                btAssert(iDest >= iCon);
+                btAssert(iDest >= 0 && iDest < numConstraintRows);
+                constraintBatchIds[iDest] = iBatch;
+            }
+        }
+    }
+}
+
+
+static void expandConstraintRows(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRows");
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )
+    {
+        const btBatchedConstraintInfo& conInfo = conInfos[ iCon ];
+        int iBatch = srcConstraintBatchIds[ iCon ];
+        for ( int i = 0; i < conInfo.numConstraintRows; ++i )
+        {
+            int iDest = conInfo.constraintIndex + i;
+            btAssert( iDest >= iCon );
+            btAssert( iDest >= 0 && iDest < numConstraintRows );
+            destConstraintBatchIds[ iDest ] = iBatch;
+        }
+    }
+}
+
+
+struct ExpandConstraintRowsLoop : public btIParallelForBody
+{
+    int* m_destConstraintBatchIds;
+    const int* m_srcConstraintBatchIds;
+    const btBatchedConstraintInfo* m_conInfos;
+    int m_numConstraintRows;
+
+    ExpandConstraintRowsLoop( int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraintRows)
+    {
+        m_destConstraintBatchIds = destConstraintBatchIds;
+        m_srcConstraintBatchIds = srcConstraintBatchIds;
+        m_conInfos = conInfos;
+        m_numConstraintRows = numConstraintRows;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        expandConstraintRows(m_destConstraintBatchIds, m_srcConstraintBatchIds + iBegin, m_conInfos + iBegin, iEnd - iBegin, m_numConstraintRows);
+    }
+};
+
+
+static void expandConstraintRowsMt(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRowsMt");
+    ExpandConstraintRowsLoop loop(destConstraintBatchIds, srcConstraintBatchIds, conInfos, numConstraintRows);
+    int grainSize = 600;
+    btParallelFor(0, numConstraints, grainSize, loop);
+}
+
+
+static void initBatchedConstraintInfoArray(btAlignedObjectArray<btBatchedConstraintInfo>* outConInfos, btConstraintArray* constraints)
+{
+    BT_PROFILE("initBatchedConstraintInfoArray");
+    btAlignedObjectArray<btBatchedConstraintInfo>& conInfos = *outConInfos;
+    int numConstraints = constraints->size();
+    conInfos.resizeNoInitialize(numConstraints);
+
+    int newSize = initBatchedConstraintInfo(&outConInfos->at(0), constraints);
+    conInfos.resizeNoInitialize(newSize);
+}
+
+
+static void mergeSmallBatches(btBatchInfo* batches, int iBeginBatch, int iEndBatch, int minBatchSize, int maxBatchSize)
+{
+    BT_PROFILE("mergeSmallBatches");
+    for ( int iBatch = iEndBatch - 1; iBatch >= iBeginBatch; --iBatch )
+    {
+        btBatchInfo& batch = batches[ iBatch ];
+        if ( batch.mergeIndex == kNoMerge && batch.numConstraints > 0 && batch.numConstraints < minBatchSize )
+        {
+            for ( int iDestBatch = iBatch - 1; iDestBatch >= iBeginBatch; --iDestBatch )
+            {
+                btBatchInfo& destBatch = batches[ iDestBatch ];
+                if ( destBatch.mergeIndex == kNoMerge && ( destBatch.numConstraints + batch.numConstraints ) < maxBatchSize )
+                {
+                    destBatch.numConstraints += batch.numConstraints;
+                    batch.numConstraints = 0;
+                    batch.mergeIndex = iDestBatch;
+                    break;
+                }
+            }
+        }
+    }
+    // flatten mergeIndexes
+    // e.g. in case where A was merged into B and then B was merged into C, we need A to point to C instead of B
+    // Note: loop goes forward through batches because batches always merge from higher indexes to lower,
+    //     so by going from low to high it reduces the amount of trail-following
+    for ( int iBatch = iBeginBatch; iBatch < iEndBatch; ++iBatch )
+    {
+        btBatchInfo& batch = batches[ iBatch ];
+        if ( batch.mergeIndex != kNoMerge )
+        {
+            int iMergeDest = batches[ batch.mergeIndex ].mergeIndex;
+            // follow trail of merges to the end
+            while ( iMergeDest != kNoMerge )
+            {
+                int iNext = batches[ iMergeDest ].mergeIndex;
+                if ( iNext == kNoMerge )
+                {
+                    batch.mergeIndex = iMergeDest;
+                    break;
+                }
+                iMergeDest = iNext;
+            }
+        }
+    }
+}
+
+
+static void updateConstraintBatchIdsForMerges(int* constraintBatchIds, int numConstraints, const btBatchInfo* batches, int numBatches)
+{
+    BT_PROFILE("updateConstraintBatchIdsForMerges");
+    // update batchIds to account for merges
+    for (int i = 0; i < numConstraints; ++i)
+    {
+        int iBatch = constraintBatchIds[i];
+        btAssert(iBatch < numBatches);
+        // if this constraint references a batch that was merged into another batch
+        if (batches[iBatch].mergeIndex != kNoMerge)
+        {
+            // update batchId
+            constraintBatchIds[i] = batches[iBatch].mergeIndex;
+        }
+    }
+}
+
+
+struct UpdateConstraintBatchIdsForMergesLoop : public btIParallelForBody
+{
+    int* m_constraintBatchIds;
+    const btBatchInfo* m_batches;
+    int m_numBatches;
+
+    UpdateConstraintBatchIdsForMergesLoop( int* constraintBatchIds, const btBatchInfo* batches, int numBatches )
+    {
+        m_constraintBatchIds = constraintBatchIds;
+        m_batches = batches;
+        m_numBatches = numBatches;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "UpdateConstraintBatchIdsForMergesLoop" );
+        updateConstraintBatchIdsForMerges( m_constraintBatchIds + iBegin, iEnd - iBegin, m_batches, m_numBatches );
+    }
+};
+
+
+static void updateConstraintBatchIdsForMergesMt(int* constraintBatchIds, int numConstraints, const btBatchInfo* batches, int numBatches)
+{
+    BT_PROFILE( "updateConstraintBatchIdsForMergesMt" );
+    UpdateConstraintBatchIdsForMergesLoop loop(constraintBatchIds, batches, numBatches);
+    int grainSize = 800;
+    btParallelFor(0, numConstraints, grainSize, loop);
+}
+
+
+inline bool BatchCompare(const btBatchedConstraints::Range& a, const btBatchedConstraints::Range& b)
+{
+    int lenA = a.end - a.begin;
+    int lenB = b.end - b.begin;
+    return lenA > lenB;
+}
+
+
+static void writeOutConstraintIndicesForRangeOfBatches(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    int* constraintIdPerBatch,
+    int batchBegin,
+    int batchEnd
+    )
+{
+    BT_PROFILE("writeOutConstraintIndicesForRangeOfBatches");
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )
+    {
+        int iBatch = constraintBatchIds[ iCon ];
+        if (iBatch >= batchBegin && iBatch < batchEnd)
+        {
+            int iDestCon = constraintIdPerBatch[ iBatch ];
+            constraintIdPerBatch[ iBatch ] = iDestCon + 1;
+            bc->m_constraintIndices[ iDestCon ] = iCon;
+        }
+    }
+}
+
+
+struct WriteOutConstraintIndicesLoop : public btIParallelForBody
+{
+    btBatchedConstraints* m_batchedConstraints;
+    const int* m_constraintBatchIds;
+    int m_numConstraints;
+    int* m_constraintIdPerBatch;
+    int m_maxNumBatchesPerPhase;
+
+    WriteOutConstraintIndicesLoop( btBatchedConstraints* bc, const int* constraintBatchIds, int numConstraints, int* constraintIdPerBatch, int maxNumBatchesPerPhase )
+    {
+        m_batchedConstraints = bc;
+        m_constraintBatchIds = constraintBatchIds;
+        m_numConstraints = numConstraints;
+        m_constraintIdPerBatch = constraintIdPerBatch;
+        m_maxNumBatchesPerPhase = maxNumBatchesPerPhase;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "WriteOutConstraintIndicesLoop" );
+        int batchBegin = iBegin * m_maxNumBatchesPerPhase;
+        int batchEnd = iEnd * m_maxNumBatchesPerPhase;
+        writeOutConstraintIndicesForRangeOfBatches(m_batchedConstraints,
+            m_constraintBatchIds,
+            m_numConstraints,
+            m_constraintIdPerBatch,
+            batchBegin,
+            batchEnd
+        );
+    }
+};
+
+
+static void writeOutConstraintIndicesMt(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    int* constraintIdPerBatch,
+    int maxNumBatchesPerPhase,
+    int numPhases
+    )
+{
+    BT_PROFILE("writeOutConstraintIndicesMt");
+    bool inParallel = true;
+    if (inParallel)
+    {
+        WriteOutConstraintIndicesLoop loop( bc, constraintBatchIds, numConstraints, constraintIdPerBatch, maxNumBatchesPerPhase );
+        btParallelFor( 0, numPhases, 1, loop );
+    }
+    else
+    {
+        for ( int iCon = 0; iCon < numConstraints; ++iCon )
+        {
+            int iBatch = constraintBatchIds[ iCon ];
+            int iDestCon = constraintIdPerBatch[ iBatch ];
+            constraintIdPerBatch[ iBatch ] = iDestCon + 1;
+            bc->m_constraintIndices[ iDestCon ] = iCon;
+        }
+    }
+}
+
+
+static void writeGrainSizes(btBatchedConstraints* bc)
+{
+    typedef btBatchedConstraints::Range Range;
+    int numPhases = bc->m_phases.size();
+    bc->m_phaseGrainSize.resizeNoInitialize(numPhases);
+    int numThreads = btGetTaskScheduler()->getNumThreads();
+    for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+    {
+        const Range& phase = bc->m_phases[ iPhase ];
+        int numBatches = phase.end - phase.begin;
+        float grainSize = floor((0.25f*numBatches / float(numThreads)) + 0.0f);
+        bc->m_phaseGrainSize[ iPhase ] = btMax(1, int(grainSize));
+    }
+}
+
+
+static void writeOutBatches(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    const btBatchInfo* batches,
+    int* batchWork,
+    int maxNumBatchesPerPhase,
+    int numPhases
+)
+{
+    BT_PROFILE("writeOutBatches");
+    typedef btBatchedConstraints::Range Range;
+    bc->m_constraintIndices.reserve( numConstraints );
+    bc->m_batches.resizeNoInitialize( 0 );
+    bc->m_phases.resizeNoInitialize( 0 );
+
+    //int maxNumBatches = numPhases * maxNumBatchesPerPhase;
+    {
+        int* constraintIdPerBatch = batchWork;  // for each batch, keep an index into the next available slot in the m_constraintIndices array
+        int iConstraint = 0;
+        for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+        {
+            int curPhaseBegin = bc->m_batches.size();
+            int iBegin = iPhase * maxNumBatchesPerPhase;
+            int iEnd = iBegin + maxNumBatchesPerPhase;
+            for ( int i = iBegin; i < iEnd; ++i )
+            {
+                const btBatchInfo& batch = batches[ i ];
+                int curBatchBegin = iConstraint;
+                constraintIdPerBatch[ i ] = curBatchBegin;  // record the start of each batch in m_constraintIndices array
+                int numConstraints = batch.numConstraints;
+                iConstraint += numConstraints;
+                if ( numConstraints > 0 )
+                {
+                    bc->m_batches.push_back( Range( curBatchBegin, iConstraint ) );
+                }
+            }
+            // if any batches were emitted this phase,
+            if ( bc->m_batches.size() > curPhaseBegin )
+            {
+                // output phase
+                bc->m_phases.push_back( Range( curPhaseBegin, bc->m_batches.size() ) );
+            }
+        }
+
+        btAssert(iConstraint == numConstraints);
+        bc->m_constraintIndices.resizeNoInitialize( numConstraints );
+        writeOutConstraintIndicesMt( bc, constraintBatchIds, numConstraints, constraintIdPerBatch, maxNumBatchesPerPhase, numPhases );
+    }
+    // for each phase
+    for (int iPhase = 0; iPhase < bc->m_phases.size(); ++iPhase)
+    {
+        // sort the batches from largest to smallest (can be helpful to some task schedulers)
+        const Range& curBatches = bc->m_phases[iPhase];
+        bc->m_batches.quickSortInternal(BatchCompare, curBatches.begin, curBatches.end-1);
+    }
+    bc->m_phaseOrder.resize(bc->m_phases.size());
+    for (int i = 0; i < bc->m_phases.size(); ++i)
+    {
+        bc->m_phaseOrder[i] = i;
+    }
+    writeGrainSizes(bc);
+}
+
+
+//
+// PreallocatedMemoryHelper -- helper object for allocating a number of chunks of memory in a single contiguous block.
+//                             It is generally more efficient to do a single larger allocation than many smaller allocations.
+//
+// Example Usage:
+//
+//  btVector3* bodyPositions = NULL;
+//  btBatchedConstraintInfo* conInfos = NULL;
+//  {
+//    PreallocatedMemoryHelper<8> memHelper;
+//    memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() );
+//    memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints );
+//    void* memPtr = malloc( memHelper.getSizeToAllocate() );  // allocate the memory
+//    memHelper.setChunkPointers( memPtr );  // update pointers to chunks
+//  }
+template <int N>
+class PreallocatedMemoryHelper
+{
+    struct Chunk
+    {
+        void** ptr;
+        size_t size;
+    };
+    Chunk m_chunks[N];
+    int m_numChunks;
+public:
+    PreallocatedMemoryHelper() {m_numChunks=0;}
+    void addChunk( void** ptr, size_t sz )
+    {
+        btAssert( m_numChunks < N );
+        if ( m_numChunks < N )
+        {
+            Chunk& chunk = m_chunks[ m_numChunks ];
+            chunk.ptr = ptr;
+            chunk.size = sz;
+            m_numChunks++;
+        }
+    }
+    size_t getSizeToAllocate() const
+    {
+        size_t totalSize = 0;
+        for (int i = 0; i < m_numChunks; ++i)
+        {
+            totalSize += m_chunks[i].size;
+        }
+        return totalSize;
+    }
+    void setChunkPointers(void* mem) const
+    {
+        size_t totalSize = 0;
+        for (int i = 0; i < m_numChunks; ++i)
+        {
+            const Chunk& chunk = m_chunks[ i ];
+            char* chunkPtr = static_cast<char*>(mem) + totalSize;
+            *chunk.ptr = chunkPtr;
+            totalSize += chunk.size;
+        }
+    }
+};
+
+
+
+static btVector3 findMaxDynamicConstraintExtent(
+    btVector3* bodyPositions,
+    bool* bodyDynamicFlags,
+    btBatchedConstraintInfo* conInfos,
+    int numConstraints,
+    int numBodies
+    )
+{
+    BT_PROFILE("findMaxDynamicConstraintExtent");
+    btVector3 consExtent = btVector3(1,1,1) * 0.001;
+    for (int iCon = 0; iCon < numConstraints; ++iCon)
+    {
+        const btBatchedConstraintInfo& con = conInfos[ iCon ];
+        int iBody0 = con.bodyIds[0];
+        int iBody1 = con.bodyIds[1];
+        btAssert(iBody0 >= 0 && iBody0 < numBodies);
+        btAssert(iBody1 >= 0 && iBody1 < numBodies);
+        // is it a dynamic constraint?
+        if (bodyDynamicFlags[iBody0] && bodyDynamicFlags[iBody1])
+        {
+            btVector3 delta = bodyPositions[iBody1] - bodyPositions[iBody0];
+            consExtent.setMax(delta.absolute());
+        }
+    }
+    return consExtent;
+}
+
+
+struct btIntVec3
+{
+    int m_ints[ 3 ];
+
+    SIMD_FORCE_INLINE const int& operator[](int i) const {return m_ints[i];}
+    SIMD_FORCE_INLINE int&       operator[](int i)       {return m_ints[i];}
+};
+
+
+struct AssignConstraintsToGridBatchesParams
+{
+    bool* bodyDynamicFlags;
+    btIntVec3* bodyGridCoords;
+    int numBodies;
+    btBatchedConstraintInfo* conInfos;
+    int* constraintBatchIds;
+    btIntVec3 gridChunkDim;
+    int maxNumBatchesPerPhase;
+    int numPhases;
+    int phaseMask;
+
+    AssignConstraintsToGridBatchesParams()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+};
+
+
+static void assignConstraintsToGridBatches(const AssignConstraintsToGridBatchesParams& params, int iConBegin, int iConEnd)
+{
+    BT_PROFILE("assignConstraintsToGridBatches");
+    // (can be done in parallel)
+    for ( int iCon = iConBegin; iCon < iConEnd; ++iCon )
+    {
+        const btBatchedConstraintInfo& con = params.conInfos[ iCon ];
+        int iBody0 = con.bodyIds[ 0 ];
+        int iBody1 = con.bodyIds[ 1 ];
+        int iPhase = iCon; //iBody0; // pseudorandom choice to distribute evenly amongst phases
+        iPhase &= params.phaseMask;
+        int gridCoord[ 3 ];
+        // is it a dynamic constraint?
+        if ( params.bodyDynamicFlags[ iBody0 ] && params.bodyDynamicFlags[ iBody1 ] )
+        {
+            const btIntVec3& body0Coords = params.bodyGridCoords[iBody0];
+            const btIntVec3& body1Coords = params.bodyGridCoords[iBody1];
+            // for each dimension x,y,z,
+            for (int i = 0; i < 3; ++i)
+            {
+                int coordMin = btMin(body0Coords.m_ints[i], body1Coords.m_ints[i]);
+                int coordMax = btMax(body0Coords.m_ints[i], body1Coords.m_ints[i]);
+                if (coordMin != coordMax)
+                {
+                    btAssert( coordMax == coordMin + 1 );
+                    if ((coordMin&1) == 0)
+                    {
+                        iPhase &= ~(1 << i); // force bit off
+                    }
+                    else
+                    {
+                        iPhase |= (1 << i); // force bit on
+                        iPhase &= params.phaseMask;
+                    }
+                }
+                gridCoord[ i ] = coordMin;
+            }
+        }
+        else
+        {
+            if ( !params.bodyDynamicFlags[ iBody0 ] )
+            {
+                iBody0 = con.bodyIds[ 1 ];
+            }
+            btAssert(params.bodyDynamicFlags[ iBody0 ]);
+            const btIntVec3& body0Coords = params.bodyGridCoords[iBody0];
+            // for each dimension x,y,z,
+            for ( int i = 0; i < 3; ++i )
+            {
+                gridCoord[ i ] = body0Coords.m_ints[ i ];
+            }
+        }
+        // calculate chunk coordinates
+        int chunkCoord[ 3 ];
+        btIntVec3 gridChunkDim = params.gridChunkDim;
+        // for each dimension x,y,z,
+        for ( int i = 0; i < 3; ++i )
+        {
+            int coordOffset = ( iPhase >> i ) & 1;
+            chunkCoord[ i ] = (gridCoord[ i ] - coordOffset)/2;
+            btClamp( chunkCoord[ i ], 0, gridChunkDim[ i ] - 1);
+            btAssert( chunkCoord[ i ] < gridChunkDim[ i ] );
+        }
+        int iBatch = iPhase * params.maxNumBatchesPerPhase + chunkCoord[ 0 ] + chunkCoord[ 1 ] * gridChunkDim[ 0 ] + chunkCoord[ 2 ] * gridChunkDim[ 0 ] * gridChunkDim[ 1 ];
+        btAssert(iBatch >= 0 && iBatch < params.maxNumBatchesPerPhase*params.numPhases);
+        params.constraintBatchIds[ iCon ] = iBatch;
+    }
+}
+
+
+struct AssignConstraintsToGridBatchesLoop : public btIParallelForBody
+{
+    const AssignConstraintsToGridBatchesParams* m_params;
+
+    AssignConstraintsToGridBatchesLoop( const AssignConstraintsToGridBatchesParams& params )
+    {
+        m_params = &params;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        assignConstraintsToGridBatches(*m_params, iBegin, iEnd);
+    }
+};
+
+
+//
+// setupSpatialGridBatchesMt -- generate batches using a uniform 3D grid
+//
+/*
+
+Bodies are treated as 3D points at their center of mass. We only consider dynamic bodies at this stage,
+because only dynamic bodies are mutated when a constraint is solved, thus subject to race conditions.
+
+1. Compute a bounding box around all dynamic bodies
+2. Compute the maximum extent of all dynamic constraints. Each dynamic constraint is treated as a line segment, and we need the size of
+   box that will fully enclose any single dynamic constraint
+
+3. Establish the cell size of our grid, the cell size in each dimension must be at least as large as the dynamic constraints max-extent,
+   so that no dynamic constraint can span more than 2 cells of our grid on any axis of the grid. The cell size should be adjusted
+   larger in order to keep the total number of cells from being excessively high
+
+Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all constraints by processing
+          in chunks of 2x2x2 cells with 8 different 1-cell offsets ((0,0,0),(0,0,1),(0,1,0),(0,1,1),(1,0,0)...).
+          For each of the 8 offsets, we create a phase, and for each 2x2x2 chunk with dynamic constraints becomes a batch in that phase.
+
+4. Once the grid is established, we can calculate for each constraint which phase and batch it belongs in.
+
+5. Do a merge small batches on the batches of each phase separately, to try to even out the sizes of batches
+
+Optionally, we can "collapse" one dimension of our 3D grid to turn it into a 2D grid, which reduces the number of phases
+to 4. With fewer phases, there are more constraints per phase and this makes it easier to create batches of a useful size.
+*/
+//
+static void setupSpatialGridBatchesMt(
+    btBatchedConstraints* batchedConstraints,
+    btAlignedObjectArray<char>* scratchMemory,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int minBatchSize,
+    int maxBatchSize,
+    bool use2DGrid
+)
+{
+    BT_PROFILE("setupSpatialGridBatchesMt");
+    const int numPhases = 8;
+    int numConstraints = constraints->size();
+    int numConstraintRows = constraints->size();
+
+    const int maxGridChunkCount = 128;
+    int allocNumBatchesPerPhase = maxGridChunkCount;
+    int minNumBatchesPerPhase = 16;
+    int allocNumBatches = allocNumBatchesPerPhase * numPhases;
+
+    btVector3* bodyPositions = NULL;
+    bool* bodyDynamicFlags = NULL;
+    btIntVec3* bodyGridCoords = NULL;
+    btBatchInfo* batches = NULL;
+    int* batchWork = NULL;
+    btBatchedConstraintInfo* conInfos = NULL;
+    int* constraintBatchIds = NULL;
+    int* constraintRowBatchIds = NULL;
+    {
+        PreallocatedMemoryHelper<10> memHelper;
+        memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() );
+        memHelper.addChunk( (void**) &bodyDynamicFlags, sizeof( bool ) * bodies.size() );
+        memHelper.addChunk( (void**) &bodyGridCoords, sizeof( btIntVec3 ) * bodies.size() );
+        memHelper.addChunk( (void**) &batches, sizeof( btBatchInfo )* allocNumBatches );
+        memHelper.addChunk( (void**) &batchWork, sizeof( int )* allocNumBatches );
+        memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints );
+        memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints );
+        memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows );
+        size_t scratchSize = memHelper.getSizeToAllocate();
+        // if we need to reallocate
+        if (scratchMemory->capacity() < scratchSize)
+        {
+            // allocate 6.25% extra to avoid repeated reallocs
+            scratchMemory->reserve( scratchSize + scratchSize/16 );
+        }
+        scratchMemory->resizeNoInitialize( scratchSize );
+        char* memPtr = &scratchMemory->at(0);
+        memHelper.setChunkPointers( memPtr );
+    }
+
+    numConstraints = initBatchedConstraintInfo(conInfos, constraints);
+
+    // compute bounding box around all dynamic bodies
+    // (could be done in parallel)
+    btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
+    btVector3 bboxMax = -bboxMin;
+    //int dynamicBodyCount = 0;
+    for (int i = 0; i < bodies.size(); ++i)
+    {
+        const btSolverBody& body = bodies[i];
+        btVector3 bodyPos = body.getWorldTransform().getOrigin();
+        bool isDynamic = ( body.internalGetInvMass().x() > btScalar( 0 ) );
+        bodyPositions[i] = bodyPos;
+        bodyDynamicFlags[i] = isDynamic;
+        if (isDynamic)
+        {
+            //dynamicBodyCount++;
+            bboxMin.setMin(bodyPos);
+            bboxMax.setMax(bodyPos);
+        }
+    }
+
+    // find max extent of all dynamic constraints
+    // (could be done in parallel)
+    btVector3 consExtent = findMaxDynamicConstraintExtent(bodyPositions, bodyDynamicFlags, conInfos, numConstraints, bodies.size());
+
+    btVector3 gridExtent = bboxMax - bboxMin;
+
+    btVector3 gridCellSize = consExtent;
+    int gridDim[3];
+    gridDim[ 0 ] = int( 1.0 + gridExtent.x() / gridCellSize.x() );
+    gridDim[ 1 ] = int( 1.0 + gridExtent.y() / gridCellSize.y() );
+    gridDim[ 2 ] = int( 1.0 + gridExtent.z() / gridCellSize.z() );
+
+    // if we can collapse an axis, it will cut our number of phases in half which could be more efficient
+    int phaseMask = 7;
+    bool collapseAxis = use2DGrid;
+    if ( collapseAxis )
+    {
+        // pick the smallest axis to collapse, leaving us with the greatest number of cells in our grid
+        int iAxisToCollapse = 0;
+        int axisDim = gridDim[iAxisToCollapse];
+        //for each dimension
+        for ( int i = 0; i < 3; ++i )
+        {
+            if (gridDim[i] < axisDim)
+            {
+                iAxisToCollapse = i;
+                axisDim = gridDim[i];
+            }
+        }
+        // collapse it
+        gridCellSize[iAxisToCollapse] = gridExtent[iAxisToCollapse] * 2.0f;
+        phaseMask &= ~(1 << iAxisToCollapse);
+    }
+
+    int numGridChunks = 0;
+    btIntVec3 gridChunkDim;  // each chunk is 2x2x2 group of cells
+    while (true)
+    {
+        gridDim[0] = int( 1.0 + gridExtent.x() / gridCellSize.x() );
+        gridDim[1] = int( 1.0 + gridExtent.y() / gridCellSize.y() );
+        gridDim[2] = int( 1.0 + gridExtent.z() / gridCellSize.z() );
+        gridChunkDim[ 0 ] = btMax( 1, ( gridDim[ 0 ] + 0 ) / 2 );
+        gridChunkDim[ 1 ] = btMax( 1, ( gridDim[ 1 ] + 0 ) / 2 );
+        gridChunkDim[ 2 ] = btMax( 1, ( gridDim[ 2 ] + 0 ) / 2 );
+        numGridChunks = gridChunkDim[ 0 ] * gridChunkDim[ 1 ] * gridChunkDim[ 2 ];
+        float nChunks = float(gridChunkDim[0]) * float(gridChunkDim[1]) * float(gridChunkDim[2]);  // suceptible to integer overflow
+        if ( numGridChunks <= maxGridChunkCount && nChunks <= maxGridChunkCount )
+        {
+            break;
+        }
+        gridCellSize *= 1.25; // should roughly cut numCells in half
+    }
+    btAssert(numGridChunks <= maxGridChunkCount );
+    int maxNumBatchesPerPhase = numGridChunks;
+
+    // for each dynamic body, compute grid coords
+    btVector3 invGridCellSize = btVector3(1,1,1)/gridCellSize;
+    // (can be done in parallel)
+    for (int iBody = 0; iBody < bodies.size(); ++iBody)
+    {
+        btIntVec3& coords = bodyGridCoords[iBody];
+        if (bodyDynamicFlags[iBody])
+        {
+            btVector3 v = ( bodyPositions[ iBody ] - bboxMin )*invGridCellSize;
+            coords.m_ints[0] = int(v.x());
+            coords.m_ints[1] = int(v.y());
+            coords.m_ints[2] = int(v.z());
+            btAssert(coords.m_ints[0] >= 0 && coords.m_ints[0] < gridDim[0]);
+            btAssert(coords.m_ints[1] >= 0 && coords.m_ints[1] < gridDim[1]);
+            btAssert(coords.m_ints[2] >= 0 && coords.m_ints[2] < gridDim[2]);
+        }
+        else
+        {
+            coords.m_ints[0] = -1;
+            coords.m_ints[1] = -1;
+            coords.m_ints[2] = -1;
+        }
+    }
+
+    for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+    {
+        int batchBegin = iPhase * maxNumBatchesPerPhase;
+        int batchEnd = batchBegin + maxNumBatchesPerPhase;
+        for ( int iBatch = batchBegin; iBatch < batchEnd; ++iBatch )
+        {
+            btBatchInfo& batch = batches[ iBatch ];
+            batch = btBatchInfo();
+        }
+    }
+
+    {
+        AssignConstraintsToGridBatchesParams params;
+        params.bodyDynamicFlags = bodyDynamicFlags;
+        params.bodyGridCoords = bodyGridCoords;
+        params.numBodies = bodies.size();
+        params.conInfos = conInfos;
+        params.constraintBatchIds = constraintBatchIds;
+        params.gridChunkDim = gridChunkDim;
+        params.maxNumBatchesPerPhase = maxNumBatchesPerPhase;
+        params.numPhases = numPhases;
+        params.phaseMask = phaseMask;
+        bool inParallel = true;
+        if (inParallel)
+        {
+            AssignConstraintsToGridBatchesLoop loop(params);
+            int grainSize = 250;
+            btParallelFor(0, numConstraints, grainSize, loop);
+        }
+        else
+        {
+            assignConstraintsToGridBatches( params, 0, numConstraints );
+        }
+    }
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )
+    {
+        const btBatchedConstraintInfo& con = conInfos[ iCon ];
+        int iBatch = constraintBatchIds[ iCon ];
+        btBatchInfo& batch = batches[iBatch];
+        batch.numConstraints += con.numConstraintRows;
+    }
+
+    for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+    {
+        // if phase is legit,
+        if (iPhase == (iPhase&phaseMask))
+        {
+            int iBeginBatch = iPhase * maxNumBatchesPerPhase;
+            int iEndBatch = iBeginBatch + maxNumBatchesPerPhase;
+            mergeSmallBatches( batches, iBeginBatch, iEndBatch, minBatchSize, maxBatchSize );
+        }
+    }
+    // all constraints have been assigned a batchId
+    updateConstraintBatchIdsForMergesMt(constraintBatchIds, numConstraints, batches, maxNumBatchesPerPhase*numPhases);
+
+    if (numConstraintRows > numConstraints)
+    {
+        expandConstraintRowsMt(&constraintRowBatchIds[0], &constraintBatchIds[0], &conInfos[0], numConstraints, numConstraintRows);
+    }
+    else
+    {
+        constraintRowBatchIds = constraintBatchIds;
+    }
+
+    writeOutBatches(batchedConstraints, constraintRowBatchIds, numConstraintRows, batches, batchWork, maxNumBatchesPerPhase, numPhases);
+    btAssert(batchedConstraints->validate(constraints, bodies));
+}
+
+
+static void setupSingleBatch(
+    btBatchedConstraints* bc,
+    int numConstraints
+)
+{
+    BT_PROFILE("setupSingleBatch");
+    typedef btBatchedConstraints::Range Range;
+
+    bc->m_constraintIndices.resize( numConstraints );
+    for ( int i = 0; i < numConstraints; ++i )
+    {
+        bc->m_constraintIndices[ i ] = i;
+    }
+
+    bc->m_batches.resizeNoInitialize( 0 );
+    bc->m_phases.resizeNoInitialize( 0 );
+    bc->m_phaseOrder.resizeNoInitialize( 0 );
+    bc->m_phaseGrainSize.resizeNoInitialize( 0 );
+
+    if (numConstraints > 0)
+    {
+        bc->m_batches.push_back( Range( 0, numConstraints ) );
+        bc->m_phases.push_back( Range( 0, 1 ) );
+        bc->m_phaseOrder.push_back(0);
+        bc->m_phaseGrainSize.push_back(1);
+    }
+}
+
+
+void btBatchedConstraints::setup(
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    BatchingMethod batchingMethod,
+    int minBatchSize,
+    int maxBatchSize,
+    btAlignedObjectArray<char>* scratchMemory
+    )
+{
+    if (constraints->size() >= minBatchSize*4)
+    {
+        bool use2DGrid = batchingMethod == BATCHING_METHOD_SPATIAL_GRID_2D;
+        setupSpatialGridBatchesMt( this, scratchMemory, constraints, bodies, minBatchSize, maxBatchSize, use2DGrid );
+        if (s_debugDrawBatches)
+        {
+            debugDrawAllBatches( this, constraints, bodies );
+        }
+    }
+    else
+    {
+        setupSingleBatch( this, constraints->size() );
+    }
+}
+
+