diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp')
-rw-r--r-- | thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp | 692 |
1 files changed, 301 insertions, 391 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp index 0d3d50c548..bd9d6bb04b 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp @@ -14,11 +14,10 @@ subject to the following restrictions: */ //Originally written by Erwin Coumans - bool useGpuInitSolverBodies = true; bool useGpuInfo1 = true; -bool useGpuInfo2= true; -bool useGpuSolveJointConstraintRows=true; +bool useGpuInfo2 = true; +bool useGpuSolveJointConstraintRows = true; bool useGpuWriteBackVelocities = true; bool gpuBreakConstraints = true; @@ -29,27 +28,25 @@ bool gpuBreakConstraints = true; #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" #include <new> #include "Bullet3Common/b3AlignedObjectArray.h" -#include <string.h> //for memset +#include <string.h> //for memset #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" -#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL +#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl" - struct b3GpuPgsJacobiSolverInternalData { - cl_context m_context; cl_device_id m_device; cl_command_queue m_queue; - b3PrefixScanCL* m_prefixScan; + b3PrefixScanCL* m_prefixScan; cl_kernel m_solveJointConstraintRowsKernels; cl_kernel m_initSolverBodiesKernel; @@ -59,31 +56,27 @@ struct b3GpuPgsJacobiSolverInternalData cl_kernel m_writeBackVelocitiesKernel; cl_kernel m_breakViolatedConstraintsKernel; - b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets; + b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets; - b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies; - b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints; - b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows; - b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1; + b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies; + b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints; + b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows; + b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1; -// b3AlignedObjectArray<b3GpuSolverBody> m_cpuSolverBodies; - b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints; - b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows; - b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1; - b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets; + // b3AlignedObjectArray<b3GpuSolverBody> m_cpuSolverBodies; + b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints; + b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows; + b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1; + b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets; - b3AlignedObjectArray<b3RigidBodyData> m_cpuBodies; - b3AlignedObjectArray<b3InertiaData> m_cpuInertias; + b3AlignedObjectArray<b3RigidBodyData> m_cpuBodies; + b3AlignedObjectArray<b3InertiaData> m_cpuInertias; - b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; - b3AlignedObjectArray<int> m_batchSizes; - - + b3AlignedObjectArray<int> m_batchSizes; }; - /* static b3Transform getWorldTransform(b3RigidBodyData* rb) { @@ -100,12 +93,12 @@ static const b3Matrix3x3& getInvInertiaTensorWorld(b3InertiaData* inertia) */ -static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb) +static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb) { return rb->m_linVel; } -static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb) +static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb) { return rb->m_angVel; } @@ -114,12 +107,9 @@ b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos) { //we also calculate lin/ang velocity for kinematic objects return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos); - } - - -b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs) +b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs) { m_usePgs = usePgs; m_gpuData = new b3GpuPgsJacobiSolverInternalData(); @@ -127,45 +117,40 @@ b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id m_gpuData->m_device = device; m_gpuData->m_queue = queue; - m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx,device,queue); + m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx, device, queue); - m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue); - m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); - cl_int errNum=0; + m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue); + cl_int errNum = 0; { - cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,&errNum,"",B3_JOINT_SOLVER_PATH); + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, &errNum, "", B3_JOINT_SOLVER_PATH); //cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,0,&errNum,"",B3_JOINT_SOLVER_PATH,true); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device,solveConstraintRowsCL, "solveJointConstraintRows",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initSolverBodies",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo1Kernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initBatchConstraintsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_getInfo2Kernel= b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo2Kernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"writeBackVelocitiesKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"breakViolatedConstraintsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - - - + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "solveJointConstraintRows", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initSolverBodies", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo1Kernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initBatchConstraintsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_getInfo2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo2Kernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "writeBackVelocitiesKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "breakViolatedConstraintsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); clReleaseProgram(prog); } - - } -b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver () +b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver() { clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels); clReleaseKernel(m_gpuData->m_initSolverBodiesKernel); @@ -195,16 +180,12 @@ struct b3BatchConstraint static b3AlignedObjectArray<b3BatchConstraint> batchConstraints; - -void b3GpuPgsConstraintSolver::recomputeBatches() +void b3GpuPgsConstraintSolver::recomputeBatches() { m_gpuData->m_batchSizes.clear(); } - - - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { B3_PROFILE("GPU solveGroupCacheFriendlySetup"); batchConstraints.resize(numConstraints); @@ -212,7 +193,6 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 m_staticIdx = -1; m_maxOverrideNumSolverIterations = 0; - /* m_gpuData->m_gpuBodies->resize(numBodies); m_gpuData->m_gpuBodies->copyFromHostPointer(bodies,numBodies); @@ -223,15 +203,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 m_gpuData->m_gpuSolverBodies->resize(numBodies); - m_tmpSolverBodyPool.resize(numBodies); { - if (useGpuInitSolverBodies) { B3_PROFILE("m_initSolverBodiesKernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initSolverBodiesKernel,"m_initSolverBodiesKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initSolverBodiesKernel, "m_initSolverBodiesKernel"); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(gpuBodies->getBufferCL()); launcher.setConst(numBodies); @@ -239,48 +217,44 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 clFinish(m_gpuData->m_queue); // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); - } else + } + else { gpuBodies->copyToHost(m_gpuData->m_cpuBodies); - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { - b3RigidBodyData& body = m_gpuData->m_cpuBodies[i]; b3GpuSolverBody& solverBody = m_tmpSolverBodyPool[i]; - initSolverBody(i,&solverBody,&body); + initSolverBody(i, &solverBody, &body); solverBody.m_originalBodyIndex = i; } m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); } } -// int totalBodies = 0; + // int totalBodies = 0; int totalNumRows = 0; //b3RigidBody* rb0=0,*rb1=0; //if (1) { { - - // int i; m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); // b3OpenCLArray<b3GpuGenericConstraint> gpuConstraints(m_gpuData->m_context,m_gpuData->m_queue); - if (useGpuInfo1) { B3_PROFILE("info1 and init batchConstraint"); - + m_gpuData->m_gpuConstraintInfo1->resize(numConstraints); - if (1) { B3_PROFILE("getInfo1Kernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo1Kernel,"m_getInfo1Kernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_getInfo1Kernel, "m_getInfo1Kernel"); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(gpuConstraints->getBufferCL()); launcher.setConst(numConstraints); @@ -288,19 +262,19 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 clFinish(m_gpuData->m_queue); } - if (m_gpuData->m_batchSizes.size()==0) + if (m_gpuData->m_batchSizes.size() == 0) { B3_PROFILE("initBatchConstraintsKernel"); m_gpuData->m_gpuConstraintRowOffsets->resize(numConstraints); - unsigned int total=0; - m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,*m_gpuData->m_gpuConstraintRowOffsets,numConstraints,&total); - unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1); - totalNumRows = total+lastElem; + unsigned int total = 0; + m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1, *m_gpuData->m_gpuConstraintRowOffsets, numConstraints, &total); + unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints - 1); + totalNumRows = total + lastElem; { B3_PROFILE("init batch constraints"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initBatchConstraintsKernel,"m_initBatchConstraintsKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initBatchConstraintsKernel, "m_initBatchConstraintsKernel"); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); @@ -313,79 +287,74 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 //assume the batching happens on CPU, so copy the data m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); } - } + } else { - totalNumRows = 0; + totalNumRows = 0; gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); //calculate the total number of contraint rows - for (int i=0;i<numConstraints;i++) + for (int i = 0; i < numConstraints; i++) { - unsigned int& info1= m_tmpConstraintSizesPool[i]; + unsigned int& info1 = m_tmpConstraintSizesPool[i]; // unsigned int info1; if (m_gpuData->m_cpuConstraints[i].isEnabled()) { - - m_gpuData->m_cpuConstraints[i].getInfo1(&info1,&m_gpuData->m_cpuBodies[0]); - } else + m_gpuData->m_cpuConstraints[i].getInfo1(&info1, &m_gpuData->m_cpuBodies[0]); + } + else { info1 = 0; } - + totalNumRows += info1; } m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); - } m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); m_gpuData->m_gpuConstraintRows->resize(totalNumRows); - + // b3GpuConstraintArray verify; if (useGpuInfo2) { { - B3_PROFILE("getInfo2Kernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo2Kernel,"m_getInfo2Kernel"); - launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); - launcher.setBuffer(gpuConstraints->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); - launcher.setBuffer(gpuBodies->getBufferCL()); - launcher.setBuffer(gpuInertias->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); - launcher.setConst(infoGlobal.m_timeStep); - launcher.setConst(infoGlobal.m_erp); - launcher.setConst(infoGlobal.m_globalCfm); - launcher.setConst(infoGlobal.m_damping); - launcher.setConst(infoGlobal.m_numIterations); - launcher.setConst(numConstraints); - launcher.launch1D(numConstraints); - clFinish(m_gpuData->m_queue); - - if (m_gpuData->m_batchSizes.size()==0) - m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); - //m_gpuData->m_gpuConstraintRows->copyToHost(verify); - //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); - - + B3_PROFILE("getInfo2Kernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_getInfo2Kernel, "m_getInfo2Kernel"); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setBuffer(gpuInertias->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setConst(infoGlobal.m_timeStep); + launcher.setConst(infoGlobal.m_erp); + launcher.setConst(infoGlobal.m_globalCfm); + launcher.setConst(infoGlobal.m_damping); + launcher.setConst(infoGlobal.m_numIterations); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + clFinish(m_gpuData->m_queue); - } - } + if (m_gpuData->m_batchSizes.size() == 0) + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + //m_gpuData->m_gpuConstraintRows->copyToHost(verify); + //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); + } + } else { - gpuInertias->copyToHost(m_gpuData->m_cpuInertias); - ///setup the b3SolverConstraints - - for (int i=0;i<numConstraints;i++) + ///setup the b3SolverConstraints + + for (int i = 0; i < numConstraints; i++) { const int& info1 = m_tmpConstraintSizesPool[i]; - + if (info1) { int constraintIndex = batchConstraints[i].m_originalConstraintIndex; @@ -394,15 +363,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 b3GpuSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[constraintRowOffset]; b3GpuGenericConstraint& constraint = m_gpuData->m_cpuConstraints[i]; - b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[ constraint.getRigidBodyA()]; + b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[constraint.getRigidBodyA()]; //b3RigidBody& rbA = constraint.getRigidBodyA(); - // b3RigidBody& rbB = constraint.getRigidBodyB(); - b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[ constraint.getRigidBodyB()]; - - + // b3RigidBody& rbB = constraint.getRigidBodyB(); + b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[constraint.getRigidBodyB()]; - int solverBodyIdA = constraint.getRigidBodyA();//getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias); - int solverBodyIdB = constraint.getRigidBodyB();//getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias); + int solverBodyIdA = constraint.getRigidBodyA(); //getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias); + int solverBodyIdB = constraint.getRigidBodyB(); //getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias); b3GpuSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; b3GpuSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; @@ -410,7 +377,8 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 if (rbA.m_invMass) { batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; - } else + } + else { if (!solverBodyIdA) m_staticIdx = 0; @@ -420,29 +388,28 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 if (rbB.m_invMass) { batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; - } else + } + else { if (!solverBodyIdB) m_staticIdx = 0; batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; } - - int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; - if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + int overrideNumSolverIterations = 0; //constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; + if (overrideNumSolverIterations > m_maxOverrideNumSolverIterations) m_maxOverrideNumSolverIterations = overrideNumSolverIterations; - int j; - for ( j=0;j<info1;j++) + for (j = 0; j < info1; j++) { - memset(¤tConstraintRow[j],0,sizeof(b3GpuSolverConstraint)); - currentConstraintRow[j].m_angularComponentA.setValue(0,0,0); - currentConstraintRow[j].m_angularComponentB.setValue(0,0,0); + memset(¤tConstraintRow[j], 0, sizeof(b3GpuSolverConstraint)); + currentConstraintRow[j].m_angularComponentA.setValue(0, 0, 0); + currentConstraintRow[j].m_angularComponentB.setValue(0, 0, 0); currentConstraintRow[j].m_appliedImpulse = 0.f; currentConstraintRow[j].m_appliedPushImpulse = 0.f; currentConstraintRow[j].m_cfm = 0.f; - currentConstraintRow[j].m_contactNormal.setValue(0,0,0); + currentConstraintRow[j].m_contactNormal.setValue(0, 0, 0); currentConstraintRow[j].m_friction = 0.f; currentConstraintRow[j].m_frictionIndex = 0; currentConstraintRow[j].m_jacDiagABInv = 0.f; @@ -451,13 +418,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 currentConstraintRow[j].m_originalContactPoint = 0; currentConstraintRow[j].m_overrideNumSolverIterations = 0; - currentConstraintRow[j].m_relpos1CrossNormal.setValue(0,0,0); - currentConstraintRow[j].m_relpos2CrossNormal.setValue(0,0,0); + currentConstraintRow[j].m_relpos1CrossNormal.setValue(0, 0, 0); + currentConstraintRow[j].m_relpos2CrossNormal.setValue(0, 0, 0); currentConstraintRow[j].m_rhs = 0.f; currentConstraintRow[j].m_rhsPenetration = 0.f; currentConstraintRow[j].m_solverBodyIdA = 0; currentConstraintRow[j].m_solverBodyIdB = 0; - + currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; currentConstraintRow[j].m_upperLimit = B3_INFINITY; currentConstraintRow[j].m_appliedImpulse = 0.f; @@ -467,26 +434,25 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; } - bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - + bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); b3GpuConstraintInfo2 info2; - info2.fps = 1.f/infoGlobal.m_timeStep; + info2.fps = 1.f / infoGlobal.m_timeStep; info2.erp = infoGlobal.m_erp; info2.m_J1linearAxis = currentConstraintRow->m_contactNormal; info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; info2.m_J2linearAxis = 0; info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; - info2.rowskip = sizeof(b3GpuSolverConstraint)/sizeof(b3Scalar);//check this + info2.rowskip = sizeof(b3GpuSolverConstraint) / sizeof(b3Scalar); //check this ///the size of b3GpuSolverConstraint needs be a multiple of b3Scalar - b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3GpuSolverConstraint)); + b3Assert(info2.rowskip * sizeof(b3Scalar) == sizeof(b3GpuSolverConstraint)); info2.m_constraintError = ¤tConstraintRow->m_rhs; currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; info2.m_damping = infoGlobal.m_damping; @@ -494,47 +460,45 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; info2.m_numIterations = infoGlobal.m_numIterations; - m_gpuData->m_cpuConstraints[i].getInfo2(&info2,&m_gpuData->m_cpuBodies[0]); + m_gpuData->m_cpuConstraints[i].getInfo2(&info2, &m_gpuData->m_cpuBodies[0]); ///finalize the constraint setup - for ( j=0;j<info1;j++) + for (j = 0; j < info1; j++) { b3GpuSolverConstraint& solverConstraint = currentConstraintRow[j]; - if (solverConstraint.m_upperLimit>=m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + if (solverConstraint.m_upperLimit >= m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) { solverConstraint.m_upperLimit = m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); } - if (solverConstraint.m_lowerLimit<=-m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + if (solverConstraint.m_lowerLimit <= -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) { solverConstraint.m_lowerLimit = -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); } - // solverConstraint.m_originalContactPoint = constraint; - - b3Matrix3x3& invInertiaWorldA= m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld; - { + // solverConstraint.m_originalContactPoint = constraint; + b3Matrix3x3& invInertiaWorldA = m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld; + { //b3Vector3 angularFactorA(1,1,1); const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; - solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA; + solverConstraint.m_angularComponentA = invInertiaWorldA * ftorqueAxis1; //*angularFactorA; } - - b3Matrix3x3& invInertiaWorldB= m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld; - { + b3Matrix3x3& invInertiaWorldB = m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld; + { const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; - solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint.getRigidBodyB().getAngularFactor(); + solverConstraint.m_angularComponentB = invInertiaWorldB * ftorqueAxis2; //*constraint.getRigidBodyB().getAngularFactor(); } { //it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal //because it gets multiplied iMJlB - b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.m_invMass; - b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal; - b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.m_invMass;//sign of normal? - b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal; + b3Vector3 iMJlA = solverConstraint.m_contactNormal * rbA.m_invMass; + b3Vector3 iMJaA = invInertiaWorldA * solverConstraint.m_relpos1CrossNormal; + b3Vector3 iMJlB = solverConstraint.m_contactNormal * rbB.m_invMass; //sign of normal? + b3Vector3 iMJaB = invInertiaWorldB * solverConstraint.m_relpos2CrossNormal; b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal); sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); @@ -542,10 +506,9 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); b3Scalar fsum = b3Fabs(sum); b3Assert(fsum > B3_EPSILON); - solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f; + solverConstraint.m_jacDiagABInv = fsum > B3_EPSILON ? b3Scalar(1.) / sum : 0.f; } - ///fix rhs ///todo: add force/torque accelerators { @@ -553,94 +516,80 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel); b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel); - rel_vel = vel1Dotn+vel2Dotn; + rel_vel = vel1Dotn + vel2Dotn; b3Scalar restitution = 0.f; - b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 - b3Scalar velocityError = restitution - rel_vel * info2.m_damping; - b3Scalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; - b3Scalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; - solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; + b3Scalar positionalError = solverConstraint.m_rhs; //already filled in by getConstraintInfo2 + b3Scalar velocityError = restitution - rel_vel * info2.m_damping; + b3Scalar penetrationImpulse = positionalError * solverConstraint.m_jacDiagABInv; + b3Scalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv; + solverConstraint.m_rhs = penetrationImpulse + velocityImpulse; solverConstraint.m_appliedImpulse = 0.f; - } } - } } - - m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); - if (m_gpuData->m_batchSizes.size()==0) + if (m_gpuData->m_batchSizes.size() == 0) m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); else m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); - - - }//end useGpuInfo2 - - + } //end useGpuInfo2 } #ifdef B3_SUPPORT_CONTACT_CONSTRAINTS { int i; - for (i=0;i<numManifolds;i++) + for (i = 0; i < numManifolds; i++) { b3Contact4& manifold = manifoldPtr[i]; - convertContact(bodies,inertias,&manifold,infoGlobal); + convertContact(bodies, inertias, &manifold, infoGlobal); } } -#endif //B3_SUPPORT_CONTACT_CONSTRAINTS +#endif //B3_SUPPORT_CONTACT_CONSTRAINTS } -// b3ContactSolverInfo info = infoGlobal; - - -// int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); -// int numConstraintPool = m_tmpSolverContactConstraintPool.size(); -// int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); + // b3ContactSolverInfo info = infoGlobal; + // int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); + // int numConstraintPool = m_tmpSolverContactConstraintPool.size(); + // int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); return 0.f; - } - - ///a straight copy from GPU/OpenCL kernel, for debugging -__inline void internalApplyImpulse( b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude) +__inline void internalApplyImpulse(b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent, float impulseMagnitude) { - body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; - body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); + body->m_deltaLinearVelocity += linearComponent * impulseMagnitude * body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent * (impulseMagnitude * body->m_angularFactor); } - -void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c) +void resolveSingleConstraintRowGeneric2(b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c) { - float deltaImpulse = c->m_rhs-b3Scalar(c->m_appliedImpulse)*c->m_cfm; - float deltaVel1Dotn = b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); - float deltaVel2Dotn = -b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + float deltaImpulse = c->m_rhs - b3Scalar(c->m_appliedImpulse) * c->m_cfm; + float deltaVel1Dotn = b3Dot(c->m_contactNormal, body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal, body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -b3Dot(c->m_contactNormal, body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal, body2->m_deltaAngularVelocity); - deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; - deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel1Dotn * c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn * c->m_jacDiagABInv; float sum = b3Scalar(c->m_appliedImpulse) + deltaImpulse; if (sum < c->m_lowerLimit) { - deltaImpulse = c->m_lowerLimit-b3Scalar(c->m_appliedImpulse); + deltaImpulse = c->m_lowerLimit - b3Scalar(c->m_appliedImpulse); c->m_appliedImpulse = c->m_lowerLimit; } - else if (sum > c->m_upperLimit) + else if (sum > c->m_upperLimit) { - deltaImpulse = c->m_upperLimit-b3Scalar(c->m_appliedImpulse); + deltaImpulse = c->m_upperLimit - b3Scalar(c->m_appliedImpulse); c->m_appliedImpulse = c->m_upperLimit; } else @@ -648,64 +597,56 @@ void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1, b3GpuSolverBod c->m_appliedImpulse = sum; } - internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); - internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); - + internalApplyImpulse(body1, c->m_contactNormal * body1->m_invMass, c->m_angularComponentA, deltaImpulse); + internalApplyImpulse(body2, -c->m_contactNormal * body2->m_invMass, c->m_angularComponentB, deltaImpulse); } - - -void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) +void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) { - - solverBody->m_deltaLinearVelocity.setValue(0.f,0.f,0.f); - solverBody->m_deltaAngularVelocity.setValue(0.f,0.f,0.f); - solverBody->internalGetPushVelocity().setValue(0.f,0.f,0.f); - solverBody->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + solverBody->m_deltaLinearVelocity.setValue(0.f, 0.f, 0.f); + solverBody->m_deltaAngularVelocity.setValue(0.f, 0.f, 0.f); + solverBody->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + solverBody->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); b3Assert(rb); -// solverBody->m_worldTransform = getWorldTransform(rb); - solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass,rb->m_invMass,rb->m_invMass)); + // solverBody->m_worldTransform = getWorldTransform(rb); + solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass, rb->m_invMass, rb->m_invMass)); solverBody->m_originalBodyIndex = bodyIndex; - solverBody->m_angularFactor = b3MakeVector3(1,1,1); - solverBody->m_linearFactor = b3MakeVector3(1,1,1); + solverBody->m_angularFactor = b3MakeVector3(1, 1, 1); + solverBody->m_linearFactor = b3MakeVector3(1, 1, 1); solverBody->m_linearVelocity = getLinearVelocity(rb); solverBody->m_angularVelocity = getAngularVelocity(rb); } - -void b3GpuPgsConstraintSolver::averageVelocities() +void b3GpuPgsConstraintSolver::averageVelocities() { } - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal) { //only create the batches once. //@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated B3_PROFILE("GpuSolveGroupCacheFriendlyIterations"); - bool createBatches = m_gpuData->m_batchSizes.size()==0; + bool createBatches = m_gpuData->m_batchSizes.size() == 0; { - if (createBatches) { - m_gpuData->m_batchSizes.resize(0); { m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); B3_PROFILE("batch joints"); - b3Assert(batchConstraints.size()==numConstraints); - int simdWidth =numConstraints+1; + b3Assert(batchConstraints.size() == numConstraints); + int simdWidth = numConstraints + 1; int numBodies = m_tmpSolverBodyPool.size(); - sortConstraintByBatch3( &batchConstraints[0], numConstraints, simdWidth , m_staticIdx, numBodies); + sortConstraintByBatch3(&batchConstraints[0], numConstraints, simdWidth, m_staticIdx, numBodies); m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); - } - } else + } + else { /*b3AlignedObjectArray<b3BatchConstraint> cpuCheckBatches; m_gpuData->m_gpuBatchConstraints->copyToHost(cpuCheckBatches); @@ -715,12 +656,11 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr //>copyFromHost(batchConstraints); } int maxIterations = infoGlobal.m_numIterations; - + bool useBatching = true; - if (useBatching ) + if (useBatching) { - if (!useGpuSolveJointConstraintRows) { B3_PROFILE("copy to host"); @@ -730,24 +670,21 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); gpuConstraints1->copyToHost(m_gpuData->m_cpuConstraints); - } - for ( int iteration = 0 ; iteration< maxIterations ; iteration++) + for (int iteration = 0; iteration < maxIterations; iteration++) { - int batchOffset = 0; - int constraintOffset=0; + int constraintOffset = 0; int numBatches = m_gpuData->m_batchSizes.size(); - for (int bb=0;bb<numBatches;bb++) + for (int bb = 0; bb < numBatches; bb++) { int numConstraintsInBatch = m_gpuData->m_batchSizes[bb]; - if (useGpuSolveJointConstraintRows) { B3_PROFILE("solveJointConstraintRowsKernels"); - + /* __kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, __global b3BatchConstraint* batchConstraints, @@ -758,53 +695,48 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr int batchOffset, int numConstraintsInBatch*/ - - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels,"m_solveJointConstraintRowsKernels"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_solveJointConstraintRowsKernels, "m_solveJointConstraintRowsKernels"); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); - launcher.setBuffer(gpuConstraints1->getBufferCL());//to detect disabled constraints + launcher.setBuffer(gpuConstraints1->getBufferCL()); //to detect disabled constraints launcher.setConst(batchOffset); launcher.setConst(numConstraintsInBatch); launcher.launch1D(numConstraintsInBatch); - - - } else//useGpu + } + else //useGpu { - - - - for (int b=0;b<numConstraintsInBatch;b++) + for (int b = 0; b < numConstraintsInBatch; b++) { - const b3BatchConstraint& c = batchConstraints[batchOffset+b]; + const b3BatchConstraint& c = batchConstraints[batchOffset + b]; /*printf("-----------\n"); printf("bb=%d\n",bb); printf("c.batchId = %d\n", c.m_batchId); */ - b3Assert(c.m_batchId==bb); + b3Assert(c.m_batchId == bb); b3GpuGenericConstraint* constraint = &m_gpuData->m_cpuConstraints[c.m_originalConstraintIndex]; - if (constraint->m_flags&B3_CONSTRAINT_FLAG_ENABLED) + if (constraint->m_flags & B3_CONSTRAINT_FLAG_ENABLED) { int numConstraintRows = m_gpuData->m_cpuConstraintInfo1[c.m_originalConstraintIndex]; int constraintOffset = m_gpuData->m_cpuConstraintRowOffsets[c.m_originalConstraintIndex]; - - for (int jj=0;jj<numConstraintRows;jj++) + + for (int jj = 0; jj < numConstraintRows; jj++) { - // - b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset+jj]; + // + b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset + jj]; //resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); - resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint); } } } - }//useGpu - batchOffset+=numConstraintsInBatch; - constraintOffset+=numConstraintsInBatch; + } //useGpu + batchOffset += numConstraintsInBatch; + constraintOffset += numConstraintsInBatch; } - }//for (int iteration... + } //for (int iteration... if (!useGpuSolveJointConstraintRows) { @@ -820,20 +752,16 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr } //int sz = sizeof(b3GpuSolverBody); //printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz); - - - - - - } else + } + else { - for ( int iteration = 0 ; iteration< maxIterations ; iteration++) - { - int numJoints = m_tmpSolverNonContactConstraintPool.size(); - for (int j=0;j<numJoints;j++) + for (int iteration = 0; iteration < maxIterations; iteration++) + { + int numJoints = m_tmpSolverNonContactConstraintPool.size(); + for (int j = 0; j < numJoints; j++) { b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[j]; - resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint); } if (!m_usePgs) @@ -842,212 +770,198 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr } } } - } clFinish(m_gpuData->m_queue); return 0.f; } - - - static b3AlignedObjectArray<int> bodyUsed; static b3AlignedObjectArray<int> curUsed; - - -inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3( b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) +inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3(b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies) { //int sz = sizeof(b3BatchConstraint); B3_PROFILE("sortConstraintByBatch3"); - + static int maxSwaps = 0; int numSwaps = 0; - curUsed.resize(2*simdWidth); + curUsed.resize(2 * simdWidth); static int maxNumConstraints = 0; - if (maxNumConstraints<numConstraints) + if (maxNumConstraints < numConstraints) { maxNumConstraints = numConstraints; //printf("maxNumConstraints = %d\n",maxNumConstraints ); } - int numUsedArray = numBodies/32+1; + int numUsedArray = numBodies / 32 + 1; bodyUsed.resize(numUsedArray); - for (int q=0;q<numUsedArray;q++) - bodyUsed[q]=0; + for (int q = 0; q < numUsedArray; q++) + bodyUsed[q] = 0; - int curBodyUsed = 0; int numIter = 0; - - + #if defined(_DEBUG) - for(int i=0; i<numConstraints; i++) + for (int i = 0; i < numConstraints; i++) cs[i].m_batchId = -1; #endif - + int numValidConstraints = 0; -// int unprocessedConstraintIndex = 0; + // int unprocessedConstraintIndex = 0; int batchIdx = 0; - { B3_PROFILE("cpu batch innerloop"); - - while( numValidConstraints < numConstraints) + + while (numValidConstraints < numConstraints) { numIter++; int nCurrentBatch = 0; // clear flag - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; - curBodyUsed = 0; + curBodyUsed = 0; - for(int i=numValidConstraints; i<numConstraints; i++) + for (int i = numValidConstraints; i < numConstraints; i++) { int idx = i; - b3Assert( idx < numConstraints ); + b3Assert(idx < numConstraints); // check if it can go int bodyAS = cs[idx].m_bodyAPtrAndSignBit; int bodyBS = cs[idx].m_bodyBPtrAndSignBit; int bodyA = abs(bodyAS); int bodyB = abs(bodyBS); - bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; - bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx; + bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx; int aUnavailable = 0; int bUnavailable = 0; if (!aIsStatic) { - aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31)); + aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31)); } if (!aUnavailable) - if (!bIsStatic) - { - bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31)); - } - - if( aUnavailable==0 && bUnavailable==0 ) // ok + if (!bIsStatic) + { + bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31)); + } + + if (aUnavailable == 0 && bUnavailable == 0) // ok { if (!aIsStatic) { - bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31)); - curUsed[curBodyUsed++]=bodyA; + bodyUsed[bodyA / 32] |= (1 << (bodyA & 31)); + curUsed[curBodyUsed++] = bodyA; } if (!bIsStatic) { - bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31)); - curUsed[curBodyUsed++]=bodyB; + bodyUsed[bodyB / 32] |= (1 << (bodyB & 31)); + curUsed[curBodyUsed++] = bodyB; } cs[idx].m_batchId = batchIdx; - if (i!=numValidConstraints) + if (i != numValidConstraints) { - b3Swap(cs[i],cs[numValidConstraints]); + b3Swap(cs[i], cs[numValidConstraints]); numSwaps++; } numValidConstraints++; { nCurrentBatch++; - if( nCurrentBatch == simdWidth ) + if (nCurrentBatch == simdWidth) { nCurrentBatch = 0; - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; curBodyUsed = 0; } } } } m_gpuData->m_batchSizes.push_back(nCurrentBatch); - batchIdx ++; + batchIdx++; } } - + #if defined(_DEBUG) - // debugPrintf( "nBatches: %d\n", batchIdx ); - for(int i=0; i<numConstraints; i++) - { - b3Assert( cs[i].m_batchId != -1 ); - } + // debugPrintf( "nBatches: %d\n", batchIdx ); + for (int i = 0; i < numConstraints; i++) + { + b3Assert(cs[i].m_batchId != -1); + } #endif - if (maxSwaps<numSwaps) + if (maxSwaps < numSwaps) { maxSwaps = numSwaps; //printf("maxSwaps = %d\n", maxSwaps); } - + return batchIdx; } - /// b3PgsJacobiSolver Sequentially applies impulses -b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, - int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints, const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { - B3_PROFILE("solveJoints"); //you need to provide at least some bodies - - solveGroupCacheFriendlySetup( gpuBodies, gpuInertias,numBodies,gpuConstraints, numConstraints,infoGlobal); - solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints,infoGlobal); + solveGroupCacheFriendlySetup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); + + solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints, infoGlobal); + + solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); - solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias,numBodies, gpuConstraints, numConstraints, infoGlobal); - return 0.f; } -void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, - int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) +void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) { b3ContactSolverInfo infoGlobal; infoGlobal.m_splitImpulse = false; - infoGlobal.m_timeStep = 1.f/60.f; - infoGlobal.m_numIterations = 4;//4; -// infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION; + infoGlobal.m_timeStep = 1.f / 60.f; + infoGlobal.m_numIterations = 4; //4; + // infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION; //infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS; - infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS; + infoGlobal.m_solverMode |= B3_SOLVER_USE_2_FRICTION_DIRECTIONS; //if (infoGlobal.m_solverMode & B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) //if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION)) - - - solveGroup(gpuBodies,gpuInertias,numBodies,gpuConstraints,numConstraints,infoGlobal); + solveGroup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); } //b3AlignedObjectArray<b3RigidBodyData> testBodies; - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { B3_PROFILE("solveGroupCacheFriendlyFinish"); -// int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); -// int i,j; - + // int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); + // int i,j; { if (gpuBreakConstraints) { B3_PROFILE("breakViolatedConstraintsKernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_breakViolatedConstraintsKernel,"m_breakViolatedConstraintsKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_breakViolatedConstraintsKernel, "m_breakViolatedConstraintsKernel"); launcher.setBuffer(gpuConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setConst(numConstraints); launcher.launch1D(numConstraints); - } else + } + else { gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); m_gpuData->m_gpuBatchConstraints->copyToHost(m_gpuData->m_cpuBatchConstraints); @@ -1056,31 +970,28 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); - for (int cid=0;cid<numConstraints;cid++) + for (int cid = 0; cid < numConstraints; cid++) { int originalConstraintIndex = batchConstraints[cid].m_originalConstraintIndex; int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[originalConstraintIndex]; int numRows = m_gpuData->m_cpuConstraintInfo1[originalConstraintIndex]; if (numRows) { - - // printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold); - for (int i=0;i<numRows;i++) + // printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold); + for (int i = 0; i < numRows; i++) { - int rowIndex =constraintRowOffset+i; + int rowIndex = constraintRowOffset + i; int orgConstraintIndex = m_gpuData->m_cpuConstraintRows[rowIndex].m_originalConstraintIndex; float breakingThreshold = m_gpuData->m_cpuConstraints[orgConstraintIndex].m_breakingImpulseThreshold; - // printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse); + // printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse); if (b3Fabs(m_gpuData->m_cpuConstraintRows[rowIndex].m_appliedImpulse) >= breakingThreshold) { - - m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED; + m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags = 0; //&= ~B3_CONSTRAINT_FLAG_ENABLED; } } } } - gpuConstraints->copyFromHost(m_gpuData->m_cpuConstraints); } } @@ -1090,28 +1001,27 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b { B3_PROFILE("GPU write back velocities and transforms"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_writeBackVelocitiesKernel,"m_writeBackVelocitiesKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_writeBackVelocitiesKernel, "m_writeBackVelocitiesKernel"); launcher.setBuffer(gpuBodies->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setConst(numBodies); launcher.launch1D(numBodies); clFinish(m_gpuData->m_queue); -// m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); -// m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); + // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + // m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); //m_gpuData->m_gpuBodies->copyToHost(testBodies); - - } + } else { B3_PROFILE("CPU write back velocities and transforms"); m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); gpuBodies->copyToHost(m_gpuData->m_cpuBodies); - for ( int i=0;i<m_tmpSolverBodyPool.size();i++) + for (int i = 0; i < m_tmpSolverBodyPool.size(); i++) { int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex; //printf("bodyIndex=%d\n",bodyIndex); - b3Assert(i==bodyIndex); + b3Assert(i == bodyIndex); b3RigidBodyData* body = &m_gpuData->m_cpuBodies[bodyIndex]; if (body->m_invMass) @@ -1125,11 +1035,12 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b { body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity; body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity; - } else + } + else { b3Assert(0); } - /* + /* if (infoGlobal.m_splitImpulse) { body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin(); @@ -1139,10 +1050,9 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b } */ } - }//for + } //for gpuBodies->copyFromHost(m_gpuData->m_cpuBodies); - } } |