1 files changed, 1225 insertions, 0 deletions
diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
new file mode 100644
index 0000000000..20bf6d47c5
--- /dev/null
+++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
@@ -0,0 +1,1225 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#include "b3Solver.h"
+
+///useNewBatchingKernel  is a rewritten kernel using just a single thread of the warp, for experiments
+bool useNewBatchingKernel = true;
+bool gConvertConstraintOnCpu = false;
+
+#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
+#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
+#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
+#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
+#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
+#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
+
+#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
+
+#include "kernels/solverSetup.h"
+#include "kernels/solverSetup2.h"
+
+#include "kernels/solveContact.h"
+#include "kernels/solveFriction.h"
+
+#include "kernels/batchingKernels.h"
+#include "kernels/batchingKernelsNew.h"
+
+
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3Common/b3Vector3.h"
+
+struct SolverDebugInfo
+{
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+
+
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+};
+
+
+
+
+class SolverDeviceInl
+{
+public:
+	struct ParallelSolveData
+	{
+		b3OpenCLArray<unsigned int>* m_numConstraints;
+		b3OpenCLArray<unsigned int>* m_offsets;
+	};
+};
+
+
+
+b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
+			:
+			m_context(ctx),
+			m_device(device),
+			m_queue(queue),
+			m_batchSizes(ctx,queue),
+			m_nIterations(4)
+{
+	m_sort32 = new b3RadixSort32CL(ctx,device,queue);
+	m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
+	m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS);
+
+	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
+
+	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
+	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);
+
+	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS );
+	m_numConstraints->resize(B3_SOLVER_N_CELLS);
+
+	m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS);
+	m_offsets->resize(B3_SOLVER_N_CELLS);
+	const char* additionalMacros = "";
+//	const char* srcFileNameForCaching="";
+
+
+
+	cl_int pErrNum;
+	const char* batchKernelSource = batchingKernelsCL;
+	const char* batchKernelNewSource = batchingKernelsNewCL;
+	
+	const char* solverSetupSource = solverSetupCL;
+	const char* solverSetup2Source = solverSetup2CL;
+	const char* solveContactSource = solveContactCL;
+	const char* solveFrictionSource = solveFrictionCL;
+	
+	
+	
+	{
+		
+		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
+		b3Assert(solveContactProg);
+		
+		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
+		b3Assert(solveFrictionProg);
+
+		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
+		b3Assert(solverSetup2Prog);
+
+		
+		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
+		b3Assert(solverSetupProg);
+		
+		
+		m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
+		b3Assert(m_solveFrictionKernel);
+
+		m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
+		b3Assert(m_solveContactKernel);
+		
+		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
+		b3Assert(m_contactToConstraintKernel);
+			
+		m_setSortDataKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_setSortDataKernel);
+				
+		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_reorderContactKernel);
+		
+
+		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
+		b3Assert(m_copyConstraintKernel);
+		
+	}
+
+	{
+		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
+		//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
+		b3Assert(batchingProg);
+		
+		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
+		b3Assert(m_batchingKernel);
+	}
+	{
+		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
+		b3Assert(batchingNewProg);
+
+		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
+		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
+		b3Assert(m_batchingKernelNew);
+	}
+}
+		
+b3Solver::~b3Solver()
+{
+	delete m_offsets;
+	delete m_numConstraints;
+	delete m_sortDataBuffer;
+	delete m_contactBuffer2;
+
+	delete m_sort32;
+	delete m_scan;
+	delete m_search;
+
+
+	clReleaseKernel(m_batchingKernel);
+	clReleaseKernel(m_batchingKernelNew);
+	
+	clReleaseKernel( m_solveContactKernel);
+	clReleaseKernel( m_solveFrictionKernel);
+
+	clReleaseKernel( m_contactToConstraintKernel);
+	clReleaseKernel( m_setSortDataKernel);
+	clReleaseKernel( m_reorderContactKernel);
+	clReleaseKernel( m_copyConstraintKernel);
+			
+}
+
+
+ 
+
+template<bool JACOBI>
+static
+__inline
+void solveContact(b3GpuConstraint4& cs, 
+	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+	float maxRambdaDt[4], float minRambdaDt[4])
+{
+
+	b3Vector3 dLinVelA; dLinVelA.setZero();
+	b3Vector3 dAngVelA; dAngVelA.setZero();
+	b3Vector3 dLinVelB; dLinVelB.setZero();
+	b3Vector3 dAngVelB; dAngVelB.setZero();
+
+	for(int ic=0; ic<4; ic++)
+	{
+		//	dont necessary because this makes change to 0
+		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+
+		{
+			b3Vector3 angular0, angular1, linear;
+			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
+			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
+			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );
+
+			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
+			rambdaDt *= cs.m_jacCoeffInv[ic];
+
+			{
+				float prevSum = cs.m_appliedRambdaDt[ic];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max( updated, minRambdaDt[ic] );
+				updated = b3Min( updated, maxRambdaDt[ic] );
+				rambdaDt = updated - prevSum;
+				cs.m_appliedRambdaDt[ic] = updated;
+			}
+
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+            b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+#endif
+			if( JACOBI )
+			{
+				dLinVelA += linImp0;
+				dAngVelA += angImp0;
+				dLinVelB += linImp1;
+				dAngVelB += angImp1;
+			}
+			else
+			{
+				linVelA += linImp0;
+				angVelA += angImp0;
+				linVelB += linImp1;
+				angVelB += angImp1;
+			}
+		}
+	}
+
+	if( JACOBI )
+	{
+		linVelA += dLinVelA;
+		angVelA += dAngVelA;
+		linVelB += dLinVelB;
+		angVelB += dAngVelB;
+	}
+
+}
+
+
+
+
+
+	static
+	__inline
+	void solveFriction(b3GpuConstraint4& cs, 
+		const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+		const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+
+		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return;
+		const b3Vector3& center = (const b3Vector3&)cs.m_center;
+
+		b3Vector3 n = -(const b3Vector3&)cs.m_linear;
+
+		b3Vector3 tangent[2];
+#if 1		
+		b3PlaneSpace1 (n, tangent[0],tangent[1]);
+#else
+		b3Vector3 r = cs.m_worldPos[0]-center;
+		tangent[0] = cross3( n, r );
+		tangent[1] = cross3( tangent[0], n );
+		tangent[0] = normalize3( tangent[0] );
+		tangent[1] = normalize3( tangent[1] );
+#endif
+
+		b3Vector3 angular0, angular1, linear;
+		b3Vector3 r0 = center - posA;
+		b3Vector3 r1 = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
+			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB );
+			rambdaDt *= cs.m_fJacCoeffInv[i];
+
+				{
+					float prevSum = cs.m_fAppliedRambdaDt[i];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = b3Max( updated, minRambdaDt[i] );
+					updated = b3Min( updated, maxRambdaDt[i] );
+					rambdaDt = updated - prevSum;
+					cs.m_fAppliedRambdaDt[i] = updated;
+				}
+
+			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
+			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
+#ifdef _WIN32
+			b3Assert(_finite(linImp0.getX()));
+			b3Assert(_finite(linImp1.getX()));
+#endif
+			linVelA += linImp0;
+			angVelA += angImp0;
+			linVelB += linImp1;
+			angVelB += angImp1;
+		}
+
+		{	//	angular damping for point constraint
+			b3Vector3 ab = ( posB - posA ).normalized();
+			b3Vector3 ac = ( center - posA ).normalized();
+			if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+			{
+				float angNA = b3Dot( n, angVelA );
+				float angNB = b3Dot( n, angVelB );
+
+				angVelA -= (angNA*0.1f)*n;
+				angVelB -= (angNB*0.1f)*n;
+			}
+		}
+
+	}
+/*
+ b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
+	b3AlignedObjectArray<b3InertiaData>& m_shapes;
+	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
+	b3AlignedObjectArray<int>* m_batchSizes;
+	int m_cellIndex;
+	int m_curWgidx;
+	int m_start;
+	int m_nConstraints;
+	bool m_solveFriction;
+	int m_maxNumBatches;
+ */
+
+struct SolveTask// : public ThreadPool::Task
+{
+	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,  b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
+		int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
+		: m_bodies( bodies ), m_shapes( shapes ), 
+		m_constraints( constraints ), 
+		m_batchSizes(batchSizes),
+		m_cellIndex(cellIndex),
+		m_curWgidx(curWgidx),
+		m_start( start ), 
+		m_nConstraints( nConstraints ),
+		m_solveFriction( true ),
+		m_maxNumBatches(maxNumBatches)
+	{}
+
+	unsigned short int getType(){ return 0; }
+
+	void run(int tIdx)
+	{
+		int offset = 0;
+		for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
+		{
+			int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii);
+			if (!numInBatch)
+				break;
+
+			for (int jj=0;jj<numInBatch;jj++)
+			{
+				int i = m_start + offset+jj;
+				int batchId = m_constraints[i].m_batchIdx;
+				b3Assert(batchId==ii);
+				float frictionCoeff = m_constraints[i].getFrictionCoeff();
+				int aIdx = (int)m_constraints[i].m_bodyA;
+				int bIdx = (int)m_constraints[i].m_bodyB;
+//				int localBatch = m_constraints[i].m_batchIdx;
+				b3RigidBodyData& bodyA = m_bodies[aIdx];
+				b3RigidBodyData& bodyB = m_bodies[bIdx];
+
+				if( !m_solveFriction )
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+
+					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
+							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+				else
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					float sum = 0;
+					for(int j=0; j<4; j++)
+					{
+						sum +=m_constraints[i].m_appliedRambdaDt[j];
+					}
+					frictionCoeff = 0.7f;
+					for(int j=0; j<4; j++)
+					{
+						maxRambdaDt[j] = frictionCoeff*sum;
+						minRambdaDt[j] = -maxRambdaDt[j];
+					}
+					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
+						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+			
+				}
+			}
+			offset+=numInBatch;
+
+
+		}
+/*		for (int bb=0;bb<m_maxNumBatches;bb++)
+		{
+			//for(int ic=m_nConstraints-1; ic>=0; ic--)
+			for(int ic=0; ic<m_nConstraints; ic++)
+			{
+				
+				int i = m_start + ic;
+				if (m_constraints[i].m_batchIdx != bb)
+					continue;
+
+				float frictionCoeff = m_constraints[i].getFrictionCoeff();
+				int aIdx = (int)m_constraints[i].m_bodyA;
+				int bIdx = (int)m_constraints[i].m_bodyB;
+				int localBatch = m_constraints[i].m_batchIdx;
+				b3RigidBodyData& bodyA = m_bodies[aIdx];
+				b3RigidBodyData& bodyB = m_bodies[bIdx];
+
+				if( !m_solveFriction )
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+
+					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
+							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+				}
+				else
+				{
+					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+					float sum = 0;
+					for(int j=0; j<4; j++)
+					{
+						sum +=m_constraints[i].m_appliedRambdaDt[j];
+					}
+					frictionCoeff = 0.7f;
+					for(int j=0; j<4; j++)
+					{
+						maxRambdaDt[j] = frictionCoeff*sum;
+						minRambdaDt[j] = -maxRambdaDt[j];
+					}
+					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
+						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
+						maxRambdaDt, minRambdaDt );
+			
+				}
+			}
+		}
+		*/
+
+
+		
+	}
+
+	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
+	b3AlignedObjectArray<b3InertiaData>& m_shapes;
+	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
+	b3AlignedObjectArray<int>* m_batchSizes;
+	int m_cellIndex;
+	int m_curWgidx;
+	int m_start;
+	int m_nConstraints;
+	bool m_solveFriction;
+	int m_maxNumBatches;
+};
+
+
+void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes)
+{
+
+#if 0
+	{	
+		int nSplitX = B3_SOLVER_N_SPLIT_X;
+		int nSplitY = B3_SOLVER_N_SPLIT_Y;
+		int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+		for (int z=0;z<4;z++)
+		{
+			for (int y=0;y<4;y++)
+			{
+				for (int x=0;x<4;x++)
+				{
+					int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
+				//	printf("newIndex=%d\n",newIndex);
+
+					int zIdx = newIndex/(nSplitX*nSplitY);
+					int remain = newIndex%(nSplitX*nSplitY);
+					int yIdx = remain/nSplitX;
+					int xIdx = remain%nSplitX;
+				//	printf("newIndex=%d\n",newIndex);
+				}
+			}
+		}
+
+		//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
+		for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+		{
+			for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+			{
+				int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+				int remain= (wgIdx%((nSplitX*nSplitY)/4));
+				int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+				int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+				
+				/*int zIdx = newIndex/(nSplitX*nSplitY);
+				int remain = newIndex%(nSplitX*nSplitY);
+				int yIdx = remain/nSplitX;
+				int xIdx = remain%nSplitX;
+				*/
+				int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+			//	printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
+			}
+		}
+	}
+#endif
+
+	b3AlignedObjectArray<b3RigidBodyData> bodyNative;
+	bodyBuf->copyToHost(bodyNative);
+	b3AlignedObjectArray<b3InertiaData> shapeNative;
+	shapeBuf->copyToHost(shapeNative);
+	b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
+	constraint->copyToHost(constraintNative);
+
+	b3AlignedObjectArray<unsigned int> numConstraintsHost;
+	m_numConstraints->copyToHost(numConstraintsHost);
+
+	//printf("------------------------\n");
+	b3AlignedObjectArray<unsigned int> offsetsHost;
+	m_offsets->copyToHost(offsetsHost);
+	static int frame=0;
+	bool useBatches=true;
+	if (useBatches)
+	{
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			{
+				
+				int nSplitX = B3_SOLVER_N_SPLIT_X;
+				int nSplitY = B3_SOLVER_N_SPLIT_Y;
+				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+				//printf("cell Batch %d\n",cellBatch);
+				b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
+				for (int i=0;i<B3_SOLVER_N_CELLS;i++)
+				{
+					usedBodies[i].resize(0);
+				}
+
+				
+
+
+				//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
+				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				{
+					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+					int remain= (wgIdx%((nSplitX*nSplitY)/4));
+					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+					
+	
+					if( numConstraintsHost[cellIdx] == 0 ) 
+						continue;
+
+					//printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
+					//printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
+					if (zIdx)
+					{
+					//printf("?\n");
+					}
+
+					if (iter==0)
+					{
+						//printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
+						//printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
+					}
+					const int start = offsetsHost[cellIdx];
+					int numConstraintsInCell = numConstraintsHost[cellIdx];
+	//				const int end = start + numConstraintsInCell;
+
+					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx);
+					task.m_solveFriction = false;
+					task.run(0);
+				
+				}
+			}
+		}
+
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
+			{
+				int nSplitX = B3_SOLVER_N_SPLIT_X;
+				int nSplitY = B3_SOLVER_N_SPLIT_Y;
+				
+
+				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
+
+				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+				{
+					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
+					int remain= (wgIdx%((nSplitX*nSplitY)/4));
+					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
+					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
+					
+					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+	
+					if( numConstraintsHost[cellIdx] == 0 ) 
+						continue;
+	
+					//printf("yIdx=%d\n",yIdx);
+					
+					const int start = offsetsHost[cellIdx];
+					int numConstraintsInCell = numConstraintsHost[cellIdx];
+	//				const int end = start + numConstraintsInCell;
+
+					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx);
+					task.m_solveFriction = true;
+					task.run(0);
+					
+				}
+			}
+		}
+
+
+	} else
+	{
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			task.m_solveFriction = false;
+			task.run(0);
+		}
+
+		for(int iter=0; iter<m_nIterations; iter++)
+		{
+			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
+			task.m_solveFriction = true;
+			task.run(0);
+		}
+	}
+
+	bodyBuf->copyFromHost(bodyNative);
+	shapeBuf->copyFromHost(shapeNative);
+	constraint->copyFromHost(constraintNative);
+	frame++;
+	
+}
+
+void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+					const b3OpenCLArray<b3InertiaData>* shapeBuf,
+					b3OpenCLArray<b3GpuConstraint4>* constraint, 
+					b3OpenCLArray<unsigned int>* m_numConstraints,
+					b3OpenCLArray<unsigned int>* m_offsets,
+					int batchId
+					)
+{
+//						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+//						b3BufferInfoCL( m_offsets->getBufferCL() ) 
+	
+	int cellBatch = batchId;
+	const int nn = B3_SOLVER_N_CELLS;
+//	int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+
+	b3AlignedObjectArray<unsigned int> gN;
+	m_numConstraints->copyToHost(gN);
+	b3AlignedObjectArray<unsigned int> gOffsets;
+	m_offsets->copyToHost(gOffsets);
+	int nSplitX = B3_SOLVER_N_SPLIT_X;
+	int nSplitY = B3_SOLVER_N_SPLIT_Y;
+	
+//	int bIdx = batchId;
+
+	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
+	constraint->copyToHost(cpuConstraints);
+
+	printf("batch = %d\n", batchId);
+
+	int numWorkgroups = nn/B3_SOLVER_N_BATCHES;
+	b3AlignedObjectArray<int> usedBodies;
+
+
+	for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+	{
+		printf("wgIdx = %d           ", wgIdx);
+
+		int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2);					
+		int remain = wgIdx%((nSplitX*nSplitY));
+		int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1);
+		int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1);
+
+		
+		int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
+		printf("cellIdx=%d\n",cellIdx);
+		if( gN[cellIdx] == 0 ) 
+			continue;
+
+		const int start = gOffsets[cellIdx];
+		const int end = start + gN[cellIdx];
+
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& constraint = cpuConstraints[c];
+			//printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
+			if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size())
+			{
+				printf("error?\n");
+			}
+			if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size())
+			{
+				printf("error?\n");
+			}
+		}
+
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& constraint = cpuConstraints[c];
+			usedBodies.push_back(constraint.m_bodyA);
+			usedBodies.push_back(constraint.m_bodyB);
+		}
+
+	}
+}
+
+static bool verify=false;
+
+void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
+{
+	
+	
+	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
+	{
+		
+		const int nn = B3_SOLVER_N_CELLS;
+
+		cdata.x = 0;
+		cdata.y = maxNumBatches;//250;
+
+
+		int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+#endif
+
+
+
+		{
+
+			B3_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<m_nIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+					
+					if (verify)
+					{
+						checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
+					}
+
+#ifdef DEBUG_ME
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+					cdata.z = ib;
+					
+
+				b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel");
+#if 1
+                    
+					b3BufferInfoCL bInfo[] = { 
+
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_offsets->getBufferCL() ) 
+#ifdef DEBUG_ME
+						,	b3BufferInfoCL(&gpuDebugInfo)
+#endif
+						};
+
+					
+
+                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+                    b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+
+                    launcher.setConst(  nSplit );
+                    launcher.launch1D( numWorkItems, 64 );
+
+                    
+#else
+                    const char* fileName = "m_batchSolveKernel.bin";
+                    FILE* f = fopen(fileName,"rb");
+                    if (f)
+                    {
+                        int sizeInBytes=0;
+                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
+                        {
+                            printf("error, cannot get file size\n");
+                            exit(0);
+                        }
+                        
+                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
+                        fread(buf,sizeInBytes,1,f);
+                        int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+                        int num = *(int*)&buf[serializedBytes];
+                        
+                        launcher.launch1D( num);
+
+                        //this clFinish is for testing on errors
+                        clFinish(m_queue);
+                    }
+
+#endif
+					
+
+#ifdef DEBUG_ME
+					clFinish(m_queue);
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					clFinish(m_queue);
+					for (int i=0;i<numWorkItems;i++)
+					{
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+
+
+				}
+			}
+		
+			clFinish(m_queue);
+
+
+		}
+
+		cdata.x = 1;
+		bool applyFriction=true;
+		if (applyFriction)
+    	{
+			B3_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<m_nIterations; iter++)
+			{
+				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					
+
+					b3BufferInfoCL bInfo[] = { 
+						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
+						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
+						b3BufferInfoCL( constraint->getBufferCL() ),
+						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
+						b3BufferInfoCL( m_offsets->getBufferCL() )
+#ifdef DEBUG_ME
+						,b3BufferInfoCL(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					//launcher.setConst(  cdata.x );
+                    launcher.setConst(  cdata.y );
+                    launcher.setConst(  cdata.z );
+                    b3Int4 nSplit;
+					nSplit.x = B3_SOLVER_N_SPLIT_X;
+					nSplit.y = B3_SOLVER_N_SPLIT_Y;
+					nSplit.z = B3_SOLVER_N_SPLIT_Z;
+
+                    launcher.setConst(  nSplit );
+                    
+					launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
+				}
+			}
+			clFinish(m_queue);
+			
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+
+	
+}
+
+void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
+	const b3OpenCLArray<b3InertiaData>* shapeBuf, 
+	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, 
+	int nContacts, const ConstraintCfg& cfg )
+{
+//	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
+	contactCOut->resize(nContacts);
+	struct CB
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+
+	{
+
+		CB cdata;
+		cdata.m_nContacts = nContacts;
+		cdata.m_dt = cfg.m_dt;
+		cdata.m_positionDrift = cfg.m_positionDrift;
+		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
+
+		
+		if (gConvertConstraintOnCpu)
+		{
+			b3AlignedObjectArray<b3RigidBodyData> gBodies;
+		bodyBuf->copyToHost(gBodies);
+
+		b3AlignedObjectArray<b3Contact4> gContact;
+		contactsIn->copyToHost(gContact);
+
+		b3AlignedObjectArray<b3InertiaData> gShapes;
+		shapeBuf->copyToHost(gShapes);
+		
+		b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
+		gConstraintOut.resize(nContacts);
+		
+			B3_PROFILE("cpu contactToConstraintKernel");
+			for (int gIdx=0;gIdx<nContacts;gIdx++)
+			{
+				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
+				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+
+				b3Float4 posA = gBodies[aIdx].m_pos;
+				b3Float4 linVelA = gBodies[aIdx].m_linVel;
+				b3Float4 angVelA = gBodies[aIdx].m_angVel;
+				float invMassA = gBodies[aIdx].m_invMass;
+				b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
+
+				b3Float4 posB = gBodies[bIdx].m_pos;
+				b3Float4 linVelB = gBodies[bIdx].m_linVel;
+				b3Float4 angVelB = gBodies[bIdx].m_angVel;
+				float invMassB = gBodies[bIdx].m_invMass;
+				b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
+
+				b3ContactConstraint4_t cs;
+
+    			setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+					&gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
+					&cs );
+		
+				cs.m_batchIdx = gContact[gIdx].m_batchIdx;
+
+				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
+			}
+
+			contactCOut->copyFromHost(gConstraintOut);
+
+		} else
+		{
+			B3_PROFILE("gpu m_contactToConstraintKernel");
+
+		
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
+				b3BufferInfoCL( contactCOut->getBufferCL() )};
+			b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			//launcher.setConst(  cdata );
+        
+			launcher.setConst(cdata.m_nContacts);
+			launcher.setConst(cdata.m_dt);
+			launcher.setConst(cdata.m_positionDrift);
+			launcher.setConst(cdata.m_positionConstraintCoeff);
+        
+			launcher.launch1D( nContacts, 64 );	
+			clFinish(m_queue);
+
+		}
+	}
+
+	
+}
+
+/*
+void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
+			b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const b3Solver::ConstraintCfg& cfg )
+{
+	
+	
+
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		
+
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+		b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+
+			b3Assert( sortSize%64 == 0 );
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
+			cdata.m_nSplit = B3_SOLVER_N_SPLIT;
+
+			
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
+			b3LauncherCL launcher( m_queue, m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( sortSize, 64 );
+		}
+
+		{	//	3. sort by cell idx
+			int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
+			int sortBit = 32;
+			//if( n <= 0xffff ) sortBit = 16;
+			//if( n <= 0xff ) sortBit = 8;
+			m_sort32->execute(*m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
+
+			m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			b3Assert( contactsIn->getType() == TYPE_HOST );
+//			b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+
+			{
+				
+
+				b3Int4 cdata; cdata.x = nContacts;
+				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
+				b3LauncherCL launcher( m_queue, m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(  cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+
+	
+}
+
+*/
+void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx )
+{
+	
+	int numWorkItems = 64*B3_SOLVER_N_CELLS;
+	{
+		B3_PROFILE("batch generation");
+		
+		b3Int4 cdata;
+		cdata.x = nContacts;
+		cdata.y = 0;
+		cdata.z = staticIdx;
+
+		
+#ifdef BATCH_DEBUG
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+		
+
+#if 0
+		b3BufferInfoCL bInfo[] = { 
+			b3BufferInfoCL( contacts->getBufferCL() ), 
+			b3BufferInfoCL(  m_contactBuffer2->getBufferCL()),
+			b3BufferInfoCL( nNative->getBufferCL() ), 
+			b3BufferInfoCL( offsetsNative->getBufferCL() ),
+#ifdef BATCH_DEBUG
+			,	b3BufferInfoCL(&gpuDebugInfo)
+#endif
+		};
+#endif
+		
+		
+
+		{
+			m_batchSizes.resize(nNative->size());
+			B3_PROFILE("batchingKernel");
+			//b3LauncherCL launcher( m_queue, m_batchingKernel);
+			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
+
+			b3LauncherCL launcher( m_queue, k,"*batchingKernel");
+			if (!useNewBatchingKernel )
+			{
+				launcher.setBuffer( contacts->getBufferCL() );
+			}
+			launcher.setBuffer( m_contactBuffer2->getBufferCL() );
+			launcher.setBuffer( nNative->getBufferCL());
+			launcher.setBuffer( offsetsNative->getBufferCL());
+			
+			launcher.setBuffer(m_batchSizes.getBufferCL());
+			
+
+			//launcher.setConst(  cdata );
+            launcher.setConst(staticIdx);
+            
+			launcher.launch1D( numWorkItems, 64 );
+			//clFinish(m_queue);
+			//b3AlignedObjectArray<int> batchSizesCPU;
+			//m_batchSizes.copyToHost(batchSizesCPU);
+			//printf(".\n");
+		}
+
+#ifdef BATCH_DEBUG
+	aaaa
+		b3Contact4* hostContacts = new b3Contact4[nContacts];
+		m_contactBuffer->read(hostContacts,nContacts);
+		clFinish(m_queue);
+
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		clFinish(m_queue);
+
+		for (int i=0;i<numWorkItems;i++)
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+
+	}
+
+//	copy buffer to buffer
+	//b3Assert(m_contactBuffer->size()==nContacts);
+	//contacts->copyFromOpenCLArray( *m_contactBuffer);
+	//clFinish(m_queue);//needed?
+	
+	
+	
+}
+