1 files changed, 369 insertions, 466 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp
index 20bf6d47c5..ccf67da1a8 100644
--- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp
@@ -13,7 +13,6 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada
 
-
 #include "b3Solver.h"
 
 ///useNewBatchingKernel  is a rewritten kernel using just a single thread of the warp, for experiments
@@ -38,7 +37,6 @@ bool gConvertConstraintOnCpu = false;
 #include "kernels/batchingKernels.h"
 #include "kernels/batchingKernelsNew.h"
 
-
 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
 #include "Bullet3Common/b3Vector3.h"
 
@@ -48,7 +46,7 @@ struct SolverDebugInfo
 	int m_valInt1;
 	int m_valInt2;
 	int m_valInt3;
-	
+
 	int m_valInt4;
 	int m_valInt5;
 	int m_valInt6;
@@ -59,11 +57,10 @@ struct SolverDebugInfo
 	int m_valInt10;
 	int m_valInt11;
 
-	int	m_valInt12;
-	int	m_valInt13;
-	int	m_valInt14;
-	int	m_valInt15;
-
+	int m_valInt12;
+	int m_valInt13;
+	int m_valInt14;
+	int m_valInt15;
 
 	float m_val0;
 	float m_val1;
@@ -71,9 +68,6 @@ struct SolverDebugInfo
 	float m_val3;
 };
 
-
-
-
 class SolverDeviceInl
 {
 public:
@@ -84,101 +78,89 @@ public:
 	};
 };
 
-
-
 b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
-			:
-			m_context(ctx),
-			m_device(device),
-			m_queue(queue),
-			m_batchSizes(ctx,queue),
-			m_nIterations(4)
+	: m_context(ctx),
+	  m_device(device),
+	  m_queue(queue),
+	  m_batchSizes(ctx, queue),
+	  m_nIterations(4)
 {
-	m_sort32 = new b3RadixSort32CL(ctx,device,queue);
-	m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
-	m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS);
+	m_sort32 = new b3RadixSort32CL(ctx, device, queue);
+	m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
+	m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
 
-	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
+	const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
 
-	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
-	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);
+	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
+	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
 
-	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS );
+	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
 	m_numConstraints->resize(B3_SOLVER_N_CELLS);
 
-	m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS);
+	m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
 	m_offsets->resize(B3_SOLVER_N_CELLS);
 	const char* additionalMacros = "";
-//	const char* srcFileNameForCaching="";
-
-
+	//	const char* srcFileNameForCaching="";
 
 	cl_int pErrNum;
 	const char* batchKernelSource = batchingKernelsCL;
 	const char* batchKernelNewSource = batchingKernelsNewCL;
-	
+
 	const char* solverSetupSource = solverSetupCL;
 	const char* solverSetup2Source = solverSetup2CL;
 	const char* solveContactSource = solveContactCL;
 	const char* solveFrictionSource = solveFrictionCL;
-	
-	
-	
+
 	{
-		
-		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
+		cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
 		b3Assert(solveContactProg);
-		
-		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
+
+		cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
 		b3Assert(solveFrictionProg);
 
-		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
+		cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
 		b3Assert(solverSetup2Prog);
 
-		
-		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
+		cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
 		b3Assert(solverSetupProg);
-		
-		
-		m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
+
+		m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
 		b3Assert(m_solveFrictionKernel);
 
-		m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
+		m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
 		b3Assert(m_solveContactKernel);
-		
-		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
+
+		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
 		b3Assert(m_contactToConstraintKernel);
-			
-		m_setSortDataKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+
+		m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 		b3Assert(m_setSortDataKernel);
-				
-		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+
+		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 		b3Assert(m_reorderContactKernel);
-		
 
-		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 		b3Assert(m_copyConstraintKernel);
-		
 	}
 
 	{
-		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
+		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
 		//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
 		b3Assert(batchingProg);
-		
-		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
+
+		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
 		b3Assert(m_batchingKernel);
 	}
 	{
-		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
+		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
 		b3Assert(batchingNewProg);
 
-		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
+		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
 		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
 		b3Assert(m_batchingKernelNew);
 	}
 }
-		
+
 b3Solver::~b3Solver()
 {
 	delete m_offsets;
@@ -190,71 +172,68 @@ b3Solver::~b3Solver()
 	delete m_scan;
 	delete m_search;
 
-
 	clReleaseKernel(m_batchingKernel);
 	clReleaseKernel(m_batchingKernelNew);
-	
-	clReleaseKernel( m_solveContactKernel);
-	clReleaseKernel( m_solveFrictionKernel);
-
-	clReleaseKernel( m_contactToConstraintKernel);
-	clReleaseKernel( m_setSortDataKernel);
-	clReleaseKernel( m_reorderContactKernel);
-	clReleaseKernel( m_copyConstraintKernel);
-			
-}
 
+	clReleaseKernel(m_solveContactKernel);
+	clReleaseKernel(m_solveFrictionKernel);
 
- 
+	clReleaseKernel(m_contactToConstraintKernel);
+	clReleaseKernel(m_setSortDataKernel);
+	clReleaseKernel(m_reorderContactKernel);
+	clReleaseKernel(m_copyConstraintKernel);
+}
 
-template<bool JACOBI>
-static
-__inline
-void solveContact(b3GpuConstraint4& cs, 
-	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
-	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
-	float maxRambdaDt[4], float minRambdaDt[4])
+template <bool JACOBI>
+static __inline void solveContact(b3GpuConstraint4& cs,
+								  const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+								  const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
+								  float maxRambdaDt[4], float minRambdaDt[4])
 {
-
-	b3Vector3 dLinVelA; dLinVelA.setZero();
-	b3Vector3 dAngVelA; dAngVelA.setZero();
-	b3Vector3 dLinVelB; dLinVelB.setZero();
-	b3Vector3 dAngVelB; dAngVelB.setZero();
-
-	for(int ic=0; ic<4; ic++)
+	b3Vector3 dLinVelA;
+	dLinVelA.setZero();
+	b3Vector3 dAngVelA;
+	dAngVelA.setZero();
+	b3Vector3 dLinVelB;
+	dLinVelB.setZero();
+	b3Vector3 dAngVelB;
+	dAngVelB.setZero();
+
+	for (int ic = 0; ic < 4; ic++)
 	{
 		//	dont necessary because this makes change to 0
-		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+		if (cs.m_jacCoeffInv[ic] == 0.f) continue;
 
 		{
 			b3Vector3 angular0, angular1, linear;
 			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
 			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
-			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );
+			setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
 
-			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
-				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
+			float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
+										linVelA, angVelA, linVelB, angVelB) +
+							 cs.m_b[ic];
 			rambdaDt *= cs.m_jacCoeffInv[ic];
 
 			{
 				float prevSum = cs.m_appliedRambdaDt[ic];
 				float updated = prevSum;
 				updated += rambdaDt;
-				updated = b3Max( updated, minRambdaDt[ic] );
-				updated = b3Min( updated, maxRambdaDt[ic] );
+				updated = b3Max(updated, minRambdaDt[ic]);
+				updated = b3Min(updated, maxRambdaDt[ic]);
 				rambdaDt = updated - prevSum;
 				cs.m_appliedRambdaDt[ic] = updated;
 			}
 
-			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
-			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
-			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
-			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+			b3Vector3 linImp0 = invMassA * linear * rambdaDt;
+			b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
 #ifdef _WIN32
-            b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp0.getX()));
 			b3Assert(_finite(linImp1.getX()));
 #endif
-			if( JACOBI )
+			if (JACOBI)
 			{
 				dLinVelA += linImp0;
 				dAngVelA += angImp0;
@@ -271,92 +250,83 @@ void solveContact(b3GpuConstraint4& cs,
 		}
 	}
 
-	if( JACOBI )
+	if (JACOBI)
 	{
 		linVelA += dLinVelA;
 		angVelA += dAngVelA;
 		linVelB += dLinVelB;
 		angVelB += dAngVelB;
 	}
-
 }
 
+static __inline void solveFriction(b3GpuConstraint4& cs,
+								   const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+								   const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
+								   float maxRambdaDt[4], float minRambdaDt[4])
+{
+	if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
+	const b3Vector3& center = (const b3Vector3&)cs.m_center;
 
+	b3Vector3 n = -(const b3Vector3&)cs.m_linear;
 
-
-
-	static
-	__inline
-	void solveFriction(b3GpuConstraint4& cs, 
-		const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
-		const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
-		float maxRambdaDt[4], float minRambdaDt[4])
-	{
-
-		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return;
-		const b3Vector3& center = (const b3Vector3&)cs.m_center;
-
-		b3Vector3 n = -(const b3Vector3&)cs.m_linear;
-
-		b3Vector3 tangent[2];
-#if 1		
-		b3PlaneSpace1 (n, tangent[0],tangent[1]);
+	b3Vector3 tangent[2];
+#if 1
+	b3PlaneSpace1(n, tangent[0], tangent[1]);
 #else
-		b3Vector3 r = cs.m_worldPos[0]-center;
-		tangent[0] = cross3( n, r );
-		tangent[1] = cross3( tangent[0], n );
-		tangent[0] = normalize3( tangent[0] );
-		tangent[1] = normalize3( tangent[1] );
+	b3Vector3 r = cs.m_worldPos[0] - center;
+	tangent[0] = cross3(n, r);
+	tangent[1] = cross3(tangent[0], n);
+	tangent[0] = normalize3(tangent[0]);
+	tangent[1] = normalize3(tangent[1]);
 #endif
 
-		b3Vector3 angular0, angular1, linear;
-		b3Vector3 r0 = center - posA;
-		b3Vector3 r1 = center - posB;
-		for(int i=0; i<2; i++)
-		{
-			setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
-			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
-				linVelA, angVelA, linVelB, angVelB );
-			rambdaDt *= cs.m_fJacCoeffInv[i];
+	b3Vector3 angular0, angular1, linear;
+	b3Vector3 r0 = center - posA;
+	b3Vector3 r1 = center - posB;
+	for (int i = 0; i < 2; i++)
+	{
+		setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
+		float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+									linVelA, angVelA, linVelB, angVelB);
+		rambdaDt *= cs.m_fJacCoeffInv[i];
 
-				{
-					float prevSum = cs.m_fAppliedRambdaDt[i];
-					float updated = prevSum;
-					updated += rambdaDt;
-					updated = b3Max( updated, minRambdaDt[i] );
-					updated = b3Min( updated, maxRambdaDt[i] );
-					rambdaDt = updated - prevSum;
-					cs.m_fAppliedRambdaDt[i] = updated;
-				}
+		{
+			float prevSum = cs.m_fAppliedRambdaDt[i];
+			float updated = prevSum;
+			updated += rambdaDt;
+			updated = b3Max(updated, minRambdaDt[i]);
+			updated = b3Min(updated, maxRambdaDt[i]);
+			rambdaDt = updated - prevSum;
+			cs.m_fAppliedRambdaDt[i] = updated;
+		}
 
-			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
-			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
-			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
-			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+		b3Vector3 linImp0 = invMassA * linear * rambdaDt;
+		b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
+		b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
+		b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
 #ifdef _WIN32
-			b3Assert(_finite(linImp0.getX()));
-			b3Assert(_finite(linImp1.getX()));
+		b3Assert(_finite(linImp0.getX()));
+		b3Assert(_finite(linImp1.getX()));
 #endif
-			linVelA += linImp0;
-			angVelA += angImp0;
-			linVelB += linImp1;
-			angVelB += angImp1;
-		}
+		linVelA += linImp0;
+		angVelA += angImp0;
+		linVelB += linImp1;
+		angVelB += angImp1;
+	}
 
-		{	//	angular damping for point constraint
-			b3Vector3 ab = ( posB - posA ).normalized();
-			b3Vector3 ac = ( center - posA ).normalized();
-			if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
-			{
-				float angNA = b3Dot( n, angVelA );
-				float angNB = b3Dot( n, angVelB );
+	{  //	angular damping for point constraint
+		b3Vector3 ab = (posB - posA).normalized();
+		b3Vector3 ac = (center - posA).normalized();
+		if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+		{
+			float angNA = b3Dot(n, angVelA);
+			float angNB = b3Dot(n, angVelB);
 
-				angVelA -= (angNA*0.1f)*n;
-				angVelB -= (angNB*0.1f)*n;
-			}
+			angVelA -= (angNA * 0.1f) * n;
+			angVelB -= (angNB * 0.1f) * n;
 		}
-
 	}
+}
 /*
  b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
 	b3AlignedObjectArray<b3InertiaData>& m_shapes;
@@ -370,79 +340,69 @@ void solveContact(b3GpuConstraint4& cs,
 	int m_maxNumBatches;
  */
 
-struct SolveTask// : public ThreadPool::Task
+struct SolveTask  // : public ThreadPool::Task
 {
-	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,  b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
-		int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
-		: m_bodies( bodies ), m_shapes( shapes ), 
-		m_constraints( constraints ), 
-		m_batchSizes(batchSizes),
-		m_cellIndex(cellIndex),
-		m_curWgidx(curWgidx),
-		m_start( start ), 
-		m_nConstraints( nConstraints ),
-		m_solveFriction( true ),
-		m_maxNumBatches(maxNumBatches)
-	{}
-
-	unsigned short int getType(){ return 0; }
+	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
+			  int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
+		: m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
+	{
+	}
+
+	unsigned short int getType() { return 0; }
 
 	void run(int tIdx)
 	{
 		int offset = 0;
-		for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
+		for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
 		{
-			int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii);
+			int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
 			if (!numInBatch)
 				break;
 
-			for (int jj=0;jj<numInBatch;jj++)
+			for (int jj = 0; jj < numInBatch; jj++)
 			{
-				int i = m_start + offset+jj;
+				int i = m_start + offset + jj;
 				int batchId = m_constraints[i].m_batchIdx;
-				b3Assert(batchId==ii);
+				b3Assert(batchId == ii);
 				float frictionCoeff = m_constraints[i].getFrictionCoeff();
 				int aIdx = (int)m_constraints[i].m_bodyA;
 				int bIdx = (int)m_constraints[i].m_bodyB;
-//				int localBatch = m_constraints[i].m_batchIdx;
+				//				int localBatch = m_constraints[i].m_batchIdx;
 				b3RigidBodyData& bodyA = m_bodies[aIdx];
 				b3RigidBodyData& bodyB = m_bodies[bIdx];
 
-				if( !m_solveFriction )
+				if (!m_solveFriction)
 				{
-					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
+					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
 
-					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
-							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
-						maxRambdaDt, minRambdaDt );
+					solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
+										(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
+										maxRambdaDt, minRambdaDt);
 				}
 				else
 				{
-					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
+					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
 					float sum = 0;
-					for(int j=0; j<4; j++)
+					for (int j = 0; j < 4; j++)
 					{
-						sum +=m_constraints[i].m_appliedRambdaDt[j];
+						sum += m_constraints[i].m_appliedRambdaDt[j];
 					}
 					frictionCoeff = 0.7f;
-					for(int j=0; j<4; j++)
+					for (int j = 0; j < 4; j++)
 					{
-						maxRambdaDt[j] = frictionCoeff*sum;
+						maxRambdaDt[j] = frictionCoeff * sum;
 						minRambdaDt[j] = -maxRambdaDt[j];
 					}
-					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
-						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
-						maxRambdaDt, minRambdaDt );
-			
+					solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
+								  (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
+								  maxRambdaDt, minRambdaDt);
 				}
 			}
-			offset+=numInBatch;
-
-
+			offset += numInBatch;
 		}
-/*		for (int bb=0;bb<m_maxNumBatches;bb++)
+		/*		for (int bb=0;bb<m_maxNumBatches;bb++)
 		{
 			//for(int ic=m_nConstraints-1; ic>=0; ic--)
 			for(int ic=0; ic<m_nConstraints; ic++)
@@ -491,9 +451,6 @@ struct SolveTask// : public ThreadPool::Task
 			}
 		}
 		*/
-
-
-		
 	}
 
 	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
@@ -508,11 +465,9 @@ struct SolveTask// : public ThreadPool::Task
 	int m_maxNumBatches;
 };
 
-
-void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, 
-			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes)
+void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
+										  b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
 {
-
 #if 0
 	{	
 		int nSplitX = B3_SOLVER_N_SPLIT_X;
@@ -571,114 +526,105 @@ void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* body
 	//printf("------------------------\n");
 	b3AlignedObjectArray<unsigned int> offsetsHost;
 	m_offsets->copyToHost(offsetsHost);
-	static int frame=0;
-	bool useBatches=true;
+	static int frame = 0;
+	bool useBatches = true;
 	if (useBatches)
 	{
-		for(int iter=0; iter<m_nIterations; iter++)
+		for (int iter = 0; iter < m_nIterations; iter++)
 		{
-			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
 			{
-				
 				int nSplitX = B3_SOLVER_N_SPLIT_X;
 				int nSplitY = B3_SOLVER_N_SPLIT_Y;
-				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+				int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
 				//printf("cell Batch %d\n",cellBatch);
 				b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
-				for (int i=0;i<B3_SOLVER_N_CELLS;i++)
+				for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
 				{
 					usedBodies[i].resize(0);
 				}
 
-				
-
-
 				//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
-				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
 				{
-					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
-					int remain= (wgIdx%((nSplitX*nSplitY)/4));
-					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
-					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
-					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
-					
-	
-					if( numConstraintsHost[cellIdx] == 0 ) 
+					int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
+					int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
+					int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
+					int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
+					int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
+
+					if (numConstraintsHost[cellIdx] == 0)
 						continue;
 
 					//printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
 					//printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
 					if (zIdx)
 					{
-					//printf("?\n");
+						//printf("?\n");
 					}
 
-					if (iter==0)
+					if (iter == 0)
 					{
 						//printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
 						//printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
 					}
 					const int start = offsetsHost[cellIdx];
 					int numConstraintsInCell = numConstraintsHost[cellIdx];
-	//				const int end = start + numConstraintsInCell;
+					//				const int end = start + numConstraintsInCell;
 
-					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx);
+					SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
 					task.m_solveFriction = false;
 					task.run(0);
-				
 				}
 			}
 		}
 
-		for(int iter=0; iter<m_nIterations; iter++)
+		for (int iter = 0; iter < m_nIterations; iter++)
 		{
-			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
 			{
 				int nSplitX = B3_SOLVER_N_SPLIT_X;
 				int nSplitY = B3_SOLVER_N_SPLIT_Y;
-				
 
-				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+				int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
 
-				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
 				{
-					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
-					int remain= (wgIdx%((nSplitX*nSplitY)/4));
-					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
-					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
-					
-					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
-	
-					if( numConstraintsHost[cellIdx] == 0 ) 
+					int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
+					int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
+					int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
+					int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
+
+					int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
+
+					if (numConstraintsHost[cellIdx] == 0)
 						continue;
-	
+
 					//printf("yIdx=%d\n",yIdx);
-					
+
 					const int start = offsetsHost[cellIdx];
 					int numConstraintsInCell = numConstraintsHost[cellIdx];
-	//				const int end = start + numConstraintsInCell;
+					//				const int end = start + numConstraintsInCell;
 
-					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx);
+					SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
 					task.m_solveFriction = true;
 					task.run(0);
-					
 				}
 			}
 		}
-
-
-	} else
+	}
+	else
 	{
-		for(int iter=0; iter<m_nIterations; iter++)
+		for (int iter = 0; iter < m_nIterations; iter++)
 		{
-			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
 			task.m_solveFriction = false;
 			task.run(0);
 		}
 
-		for(int iter=0; iter<m_nIterations; iter++)
+		for (int iter = 0; iter < m_nIterations; iter++)
 		{
-			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
 			task.m_solveFriction = true;
 			task.run(0);
 		}
@@ -688,23 +634,21 @@ void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* body
 	shapeBuf->copyFromHost(shapeNative);
 	constraint->copyFromHost(constraintNative);
 	frame++;
-	
 }
 
 void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
-					const b3OpenCLArray<b3InertiaData>* shapeBuf,
-					b3OpenCLArray<b3GpuConstraint4>* constraint, 
-					b3OpenCLArray<unsigned int>* m_numConstraints,
-					b3OpenCLArray<unsigned int>* m_offsets,
-					int batchId
-					)
+						  const b3OpenCLArray<b3InertiaData>* shapeBuf,
+						  b3OpenCLArray<b3GpuConstraint4>* constraint,
+						  b3OpenCLArray<unsigned int>* m_numConstraints,
+						  b3OpenCLArray<unsigned int>* m_offsets,
+						  int batchId)
 {
-//						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
-//						b3BufferInfoCL( m_offsets->getBufferCL() ) 
-	
+	//						b3BufferInfoCL( m_numConstraints->getBufferCL() ),
+	//						b3BufferInfoCL( m_offsets->getBufferCL() )
+
 	int cellBatch = batchId;
 	const int nn = B3_SOLVER_N_CELLS;
-//	int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+	//	int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
 
 	b3AlignedObjectArray<unsigned int> gN;
 	m_numConstraints->copyToHost(gN);
@@ -712,243 +656,220 @@ void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
 	m_offsets->copyToHost(gOffsets);
 	int nSplitX = B3_SOLVER_N_SPLIT_X;
 	int nSplitY = B3_SOLVER_N_SPLIT_Y;
-	
-//	int bIdx = batchId;
+
+	//	int bIdx = batchId;
 
 	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
 	constraint->copyToHost(cpuConstraints);
 
 	printf("batch = %d\n", batchId);
 
-	int numWorkgroups = nn/B3_SOLVER_N_BATCHES;
+	int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
 	b3AlignedObjectArray<int> usedBodies;
 
-
-	for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+	for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
 	{
 		printf("wgIdx = %d           ", wgIdx);
 
-		int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2);					
-		int remain = wgIdx%((nSplitX*nSplitY));
-		int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1);
-		int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1);
+		int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
+		int remain = wgIdx % ((nSplitX * nSplitY));
+		int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
+		int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
 
-		
-		int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
-		printf("cellIdx=%d\n",cellIdx);
-		if( gN[cellIdx] == 0 ) 
+		int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
+		printf("cellIdx=%d\n", cellIdx);
+		if (gN[cellIdx] == 0)
 			continue;
 
 		const int start = gOffsets[cellIdx];
 		const int end = start + gN[cellIdx];
 
-		for (int c=start;c<end;c++)
+		for (int c = start; c < end; c++)
 		{
 			b3GpuConstraint4& constraint = cpuConstraints[c];
 			//printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
-			if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size())
+			if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
 			{
 				printf("error?\n");
 			}
-			if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size())
+			if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
 			{
 				printf("error?\n");
 			}
 		}
 
-		for (int c=start;c<end;c++)
+		for (int c = start; c < end; c++)
 		{
 			b3GpuConstraint4& constraint = cpuConstraints[c];
 			usedBodies.push_back(constraint.m_bodyA);
 			usedBodies.push_back(constraint.m_bodyB);
 		}
-
 	}
 }
 
-static bool verify=false;
+static bool verify = false;
 
-void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
-			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
+void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
+									  b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
 {
-	
-	
-	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
+	b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
 	{
-		
 		const int nn = B3_SOLVER_N_CELLS;
 
 		cdata.x = 0;
-		cdata.y = maxNumBatches;//250;
-
+		cdata.y = maxNumBatches;  //250;
 
-		int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+		int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
 #ifdef DEBUG_ME
-		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
-		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
 #endif
 
-
-
 		{
-
 			B3_PROFILE("m_batchSolveKernel iterations");
-			for(int iter=0; iter<m_nIterations; iter++)
+			for (int iter = 0; iter < m_nIterations; iter++)
 			{
-				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
 				{
-					
 					if (verify)
 					{
-						checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
+						checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
 					}
 
 #ifdef DEBUG_ME
-					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
-					gpuDebugInfo.write(debugInfo,numWorkItems);
+					memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
+					gpuDebugInfo.write(debugInfo, numWorkItems);
 #endif
 
-
 					cdata.z = ib;
-					
 
-				b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel");
+					b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
 #if 1
-                    
-					b3BufferInfoCL bInfo[] = { 
-
-						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
-						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
-						b3BufferInfoCL( constraint->getBufferCL() ),
-						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
-						b3BufferInfoCL( m_offsets->getBufferCL() ) 
+
+					b3BufferInfoCL bInfo[] = {
+
+						b3BufferInfoCL(bodyBuf->getBufferCL()),
+						b3BufferInfoCL(shapeBuf->getBufferCL()),
+						b3BufferInfoCL(constraint->getBufferCL()),
+						b3BufferInfoCL(m_numConstraints->getBufferCL()),
+						b3BufferInfoCL(m_offsets->getBufferCL())
 #ifdef DEBUG_ME
-						,	b3BufferInfoCL(&gpuDebugInfo)
+							,
+						b3BufferInfoCL(&gpuDebugInfo)
 #endif
-						};
-
-					
+					};
 
-                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 					//launcher.setConst(  cdata.x );
-                    launcher.setConst(  cdata.y );
-                    launcher.setConst(  cdata.z );
-                    b3Int4 nSplit;
+					launcher.setConst(cdata.y);
+					launcher.setConst(cdata.z);
+					b3Int4 nSplit;
 					nSplit.x = B3_SOLVER_N_SPLIT_X;
 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
 
-                    launcher.setConst(  nSplit );
-                    launcher.launch1D( numWorkItems, 64 );
+					launcher.setConst(nSplit);
+					launcher.launch1D(numWorkItems, 64);
 
-                    
 #else
-                    const char* fileName = "m_batchSolveKernel.bin";
-                    FILE* f = fopen(fileName,"rb");
-                    if (f)
-                    {
-                        int sizeInBytes=0;
-                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
-                        {
-                            printf("error, cannot get file size\n");
-                            exit(0);
-                        }
-                        
-                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
-                        fread(buf,sizeInBytes,1,f);
-                        int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
-                        int num = *(int*)&buf[serializedBytes];
-                        
-                        launcher.launch1D( num);
-
-                        //this clFinish is for testing on errors
-                        clFinish(m_queue);
-                    }
+					const char* fileName = "m_batchSolveKernel.bin";
+					FILE* f = fopen(fileName, "rb");
+					if (f)
+					{
+						int sizeInBytes = 0;
+						if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
+						{
+							printf("error, cannot get file size\n");
+							exit(0);
+						}
+
+						unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
+						fread(buf, sizeInBytes, 1, f);
+						int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
+						int num = *(int*)&buf[serializedBytes];
+
+						launcher.launch1D(num);
+
+						//this clFinish is for testing on errors
+						clFinish(m_queue);
+					}
 
 #endif
-					
 
 #ifdef DEBUG_ME
 					clFinish(m_queue);
-					gpuDebugInfo.read(debugInfo,numWorkItems);
+					gpuDebugInfo.read(debugInfo, numWorkItems);
 					clFinish(m_queue);
-					for (int i=0;i<numWorkItems;i++)
+					for (int i = 0; i < numWorkItems; i++)
 					{
-						if (debugInfo[i].m_valInt2>0)
+						if (debugInfo[i].m_valInt2 > 0)
 						{
-							printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+							printf("debugInfo[%d].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
 						}
 
-						if (debugInfo[i].m_valInt3>0)
+						if (debugInfo[i].m_valInt3 > 0)
 						{
-							printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+							printf("debugInfo[%d].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
 						}
 					}
-#endif //DEBUG_ME
-
-
+#endif  //DEBUG_ME
 				}
 			}
-		
-			clFinish(m_queue);
-
 
+			clFinish(m_queue);
 		}
 
 		cdata.x = 1;
-		bool applyFriction=true;
+		bool applyFriction = true;
 		if (applyFriction)
-    	{
+		{
 			B3_PROFILE("m_batchSolveKernel iterations2");
-			for(int iter=0; iter<m_nIterations; iter++)
+			for (int iter = 0; iter < m_nIterations; iter++)
 			{
-				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
 				{
 					cdata.z = ib;
-					
-
-					b3BufferInfoCL bInfo[] = { 
-						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
-						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
-						b3BufferInfoCL( constraint->getBufferCL() ),
-						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
-						b3BufferInfoCL( m_offsets->getBufferCL() )
+
+					b3BufferInfoCL bInfo[] = {
+						b3BufferInfoCL(bodyBuf->getBufferCL()),
+						b3BufferInfoCL(shapeBuf->getBufferCL()),
+						b3BufferInfoCL(constraint->getBufferCL()),
+						b3BufferInfoCL(m_numConstraints->getBufferCL()),
+						b3BufferInfoCL(m_offsets->getBufferCL())
 #ifdef DEBUG_ME
-						,b3BufferInfoCL(&gpuDebugInfo)
-#endif //DEBUG_ME
+							,
+						b3BufferInfoCL(&gpuDebugInfo)
+#endif  //DEBUG_ME
 					};
-					b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" );
-					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
+					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 					//launcher.setConst(  cdata.x );
-                    launcher.setConst(  cdata.y );
-                    launcher.setConst(  cdata.z );
-                    b3Int4 nSplit;
+					launcher.setConst(cdata.y);
+					launcher.setConst(cdata.z);
+					b3Int4 nSplit;
 					nSplit.x = B3_SOLVER_N_SPLIT_X;
 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
 
-                    launcher.setConst(  nSplit );
-                    
-					launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
+					launcher.setConst(nSplit);
+
+					launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
 				}
 			}
 			clFinish(m_queue);
-			
 		}
 #ifdef DEBUG_ME
 		delete[] debugInfo;
-#endif //DEBUG_ME
+#endif  //DEBUG_ME
 	}
-
-	
 }
 
-void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
-	const b3OpenCLArray<b3InertiaData>* shapeBuf, 
-	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, 
-	int nContacts, const ConstraintCfg& cfg )
+void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+									const b3OpenCLArray<b3InertiaData>* shapeBuf,
+									b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
+									int nContacts, const ConstraintCfg& cfg)
 {
-//	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
+	//	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
 	contactCOut->resize(nContacts);
 	struct CB
 	{
@@ -959,30 +880,28 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyB
 	};
 
 	{
-
 		CB cdata;
 		cdata.m_nContacts = nContacts;
 		cdata.m_dt = cfg.m_dt;
 		cdata.m_positionDrift = cfg.m_positionDrift;
 		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
 
-		
 		if (gConvertConstraintOnCpu)
 		{
 			b3AlignedObjectArray<b3RigidBodyData> gBodies;
-		bodyBuf->copyToHost(gBodies);
+			bodyBuf->copyToHost(gBodies);
 
-		b3AlignedObjectArray<b3Contact4> gContact;
-		contactsIn->copyToHost(gContact);
+			b3AlignedObjectArray<b3Contact4> gContact;
+			contactsIn->copyToHost(gContact);
+
+			b3AlignedObjectArray<b3InertiaData> gShapes;
+			shapeBuf->copyToHost(gShapes);
+
+			b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
+			gConstraintOut.resize(nContacts);
 
-		b3AlignedObjectArray<b3InertiaData> gShapes;
-		shapeBuf->copyToHost(gShapes);
-		
-		b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
-		gConstraintOut.resize(nContacts);
-		
 			B3_PROFILE("cpu contactToConstraintKernel");
-			for (int gIdx=0;gIdx<nContacts;gIdx++)
+			for (int gIdx = 0; gIdx < nContacts; gIdx++)
 			{
 				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
 				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
@@ -1001,40 +920,36 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyB
 
 				b3ContactConstraint4_t cs;
 
-    			setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
-					&gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
-					&cs );
-		
+				setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+							   &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
+							   &cs);
+
 				cs.m_batchIdx = gContact[gIdx].m_batchIdx;
 
 				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
 			}
 
 			contactCOut->copyFromHost(gConstraintOut);
-
-		} else
+		}
+		else
 		{
 			B3_PROFILE("gpu m_contactToConstraintKernel");
 
-		
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
-				b3BufferInfoCL( contactCOut->getBufferCL() )};
-			b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" );
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
+									  b3BufferInfoCL(contactCOut->getBufferCL())};
+			b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 			//launcher.setConst(  cdata );
-        
+
 			launcher.setConst(cdata.m_nContacts);
 			launcher.setConst(cdata.m_dt);
 			launcher.setConst(cdata.m_positionDrift);
 			launcher.setConst(cdata.m_positionConstraintCoeff);
-        
-			launcher.launch1D( nContacts, 64 );	
-			clFinish(m_queue);
 
+			launcher.launch1D(nContacts, 64);
+			clFinish(m_queue);
 		}
 	}
-
-	
 }
 
 /*
@@ -1115,28 +1030,24 @@ void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
 }
 
 */
-void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx )
+void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
 {
-	
-	int numWorkItems = 64*B3_SOLVER_N_CELLS;
+	int numWorkItems = 64 * B3_SOLVER_N_CELLS;
 	{
 		B3_PROFILE("batch generation");
-		
+
 		b3Int4 cdata;
 		cdata.x = nContacts;
 		cdata.y = 0;
 		cdata.z = staticIdx;
 
-		
 #ifdef BATCH_DEBUG
-		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
-		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
-		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
-		gpuDebugInfo.write(debugInfo,numWorkItems);
+		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
+		memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
+		gpuDebugInfo.write(debugInfo, numWorkItems);
 #endif
 
-		
-
 #if 0
 		b3BufferInfoCL bInfo[] = { 
 			b3BufferInfoCL( contacts->getBufferCL() ), 
@@ -1148,8 +1059,6 @@ void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContact
 #endif
 		};
 #endif
-		
-		
 
 		{
 			m_batchSizes.resize(nNative->size());
@@ -1157,22 +1066,21 @@ void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContact
 			//b3LauncherCL launcher( m_queue, m_batchingKernel);
 			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
 
-			b3LauncherCL launcher( m_queue, k,"*batchingKernel");
-			if (!useNewBatchingKernel )
+			b3LauncherCL launcher(m_queue, k, "*batchingKernel");
+			if (!useNewBatchingKernel)
 			{
-				launcher.setBuffer( contacts->getBufferCL() );
+				launcher.setBuffer(contacts->getBufferCL());
 			}
-			launcher.setBuffer( m_contactBuffer2->getBufferCL() );
-			launcher.setBuffer( nNative->getBufferCL());
-			launcher.setBuffer( offsetsNative->getBufferCL());
-			
+			launcher.setBuffer(m_contactBuffer2->getBufferCL());
+			launcher.setBuffer(nNative->getBufferCL());
+			launcher.setBuffer(offsetsNative->getBufferCL());
+
 			launcher.setBuffer(m_batchSizes.getBufferCL());
-			
 
 			//launcher.setConst(  cdata );
-            launcher.setConst(staticIdx);
-            
-			launcher.launch1D( numWorkItems, 64 );
+			launcher.setConst(staticIdx);
+
+			launcher.launch1D(numWorkItems, 64);
 			//clFinish(m_queue);
 			//b3AlignedObjectArray<int> batchSizesCPU;
 			//m_batchSizes.copyToHost(batchSizesCPU);
@@ -1180,46 +1088,41 @@ void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContact
 		}
 
 #ifdef BATCH_DEBUG
-	aaaa
-		b3Contact4* hostContacts = new b3Contact4[nContacts];
-		m_contactBuffer->read(hostContacts,nContacts);
+		aaaa
+			b3Contact4* hostContacts = new b3Contact4[nContacts];
+		m_contactBuffer->read(hostContacts, nContacts);
 		clFinish(m_queue);
 
-		gpuDebugInfo.read(debugInfo,numWorkItems);
+		gpuDebugInfo.read(debugInfo, numWorkItems);
 		clFinish(m_queue);
 
-		for (int i=0;i<numWorkItems;i++)
+		for (int i = 0; i < numWorkItems; i++)
 		{
-			if (debugInfo[i].m_valInt1>0)
+			if (debugInfo[i].m_valInt1 > 0)
 			{
 				printf("catch\n");
 			}
-			if (debugInfo[i].m_valInt2>0)
+			if (debugInfo[i].m_valInt2 > 0)
 			{
 				printf("catch22\n");
 			}
 
-			if (debugInfo[i].m_valInt3>0)
+			if (debugInfo[i].m_valInt3 > 0)
 			{
 				printf("catch666\n");
 			}
 
-			if (debugInfo[i].m_valInt4>0)
+			if (debugInfo[i].m_valInt4 > 0)
 			{
 				printf("catch777\n");
 			}
 		}
 		delete[] debugInfo;
-#endif //BATCH_DEBUG
-
+#endif  //BATCH_DEBUG
 	}
 
-//	copy buffer to buffer
+	//	copy buffer to buffer
 	//b3Assert(m_contactBuffer->size()==nContacts);
 	//contacts->copyFromOpenCLArray( *m_contactBuffer);
 	//clFinish(m_queue);//needed?
-	
-	
-	
 }
-