34 files changed, 25885 insertions, 0 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
new file mode 100644
index 0000000000..872f039506
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
@@ -0,0 +1,18 @@
+#ifndef B3_BVH_INFO_H
+#define B3_BVH_INFO_H
+
+#include "Bullet3Common/b3Vector3.h"
+
+struct b3BvhInfo	// Plain record bundling bounds, a quantization vector, and node/sub-tree counts and offsets for one BVH.
+{
+	b3Vector3	m_aabbMin;	// presumably the minimum corner of the tree's bounding box -- TODO confirm
+	b3Vector3	m_aabbMax;	// presumably the maximum corner of the tree's bounding box -- TODO confirm
+	b3Vector3	m_quantization;	// NOTE(review): looks like the scale used to quantize node bounds -- confirm
+	int			m_numNodes;	// number of nodes
+	int			m_numSubTrees;	// number of sub-trees
+	int			m_nodeOffset;	// NOTE(review): looks like a start index into a shared node array -- confirm
+	int			m_subTreeOffset;	// NOTE(review): looks like a start index into a shared sub-tree array -- confirm
+
+};
+
+#endif //B3_BVH_INFO_H
+\ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
new file mode 100644
index 0000000000..cb30ee939b
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
@@ -0,0 +1,258 @@
+
+#if 0
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "b3ContactCache.h"
+#include "Bullet3Common/b3Transform.h"
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+b3Scalar					gContactBreakingThreshold = b3Scalar(0.02);
+
+///gContactCalcArea3Points will approximate the convex hull area using 3 points
+///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower
+bool						gContactCalcArea3Points = true;
+
+
+
+
+static inline b3Scalar calcArea4Points(const b3Vector3 &p0,const b3Vector3 &p1,const b3Vector3 &p2,const b3Vector3 &p3)
+{
+	// Splits the four points into two pairs in each of the three possible ways, crosses the two pair-difference vectors, and returns the largest squared cross-product length.
+
+	b3Vector3 a[3],b[3];
+	a[0] = p0 - p1;	// pairing 0: (p0,p1) with (p2,p3)
+	a[1] = p0 - p2;	// pairing 1: (p0,p2) with (p1,p3)
+	a[2] = p0 - p3;	// pairing 2: (p0,p3) with (p1,p2)
+	b[0] = p2 - p3;
+	b[1] = p1 - p3;
+	b[2] = p1 - p2;
+
+	//todo: the following 3 cross products can be easily optimized with SIMD.
+	b3Vector3 tmp0 = a[0].cross(b[0]);
+	b3Vector3 tmp1 = a[1].cross(b[1]);
+	b3Vector3 tmp2 = a[2].cross(b[2]);
+
+	return b3Max(b3Max(tmp0.length2(),tmp1.length2()),tmp2.length2());	// squared lengths, so the result is an area-like score rather than an area
+}
+#if 0
+
+//using localPointA for all points
+int b3ContactCache::sortCachedPoints(const b3Vector3& pt) // Scores, for each of the 4 cached slots, the point set obtained by replacing that slot with pt, and returns the index chosen by closestAxis4().
+{	// NOTE(review): this definition sits inside nested #if 0 blocks and reads getDistance()/m_localPointA/m_pointCache, none of which the class declaration provides -- confirm before enabling.
+		//calculate 4 possible cases areas, and take biggest area
+		//also need to keep 'deepest'
+		
+		int maxPenetrationIndex = -1;	// stays -1 when pt itself has the smallest distance
+#define KEEP_DEEPEST_POINT 1
+#ifdef KEEP_DEEPEST_POINT
+		b3Scalar maxPenetration = pt.getDistance();
+		for (int i=0;i<4;i++)
+		{
+			if (m_pointCache[i].getDistance() < maxPenetration)	// smaller distance is treated as deeper
+			{
+				maxPenetrationIndex = i;
+				maxPenetration = m_pointCache[i].getDistance();
+			}
+		}
+#endif //KEEP_DEEPEST_POINT
+		
+		b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.));	// resK = score when slot K is the one replaced; left at 0 for the deepest slot
+
+	if (gContactCalcArea3Points)
+	{
+		if (maxPenetrationIndex != 0)	// slot 0 replaced: uses pt and cached points 1,2,3
+		{
+			b3Vector3 a0 = pt.m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 b0 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a0.cross(b0);
+			res0 = cross.length2();
+		}
+		if (maxPenetrationIndex != 1)	// slot 1 replaced: uses pt and cached points 0,2,3
+		{
+			b3Vector3 a1 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b1 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a1.cross(b1);
+			res1 = cross.length2();
+		}
+
+		if (maxPenetrationIndex != 2)	// slot 2 replaced: uses pt and cached points 0,1,3
+		{
+			b3Vector3 a2 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b2 = m_pointCache[3].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a2.cross(b2);
+			res2 = cross.length2();
+		}
+
+		if (maxPenetrationIndex != 3)	// slot 3 replaced: uses pt and cached points 0,1,2
+		{
+			b3Vector3 a3 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b3 = m_pointCache[2].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a3.cross(b3);
+			res3 = cross.length2();
+		}
+	} 
+	else	// slower variant: same slot logic, scored with calcArea4Points
+	{
+		if(maxPenetrationIndex != 0) {
+			res0 = calcArea4Points(pt.m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+
+		if(maxPenetrationIndex != 1) {
+			res1 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+
+		if(maxPenetrationIndex != 2) {
+			res2 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[3].m_localPointA);
+		}
+
+		if(maxPenetrationIndex != 3) {
+			res3 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA);
+		}
+	}
+	b3Vector4 maxvec(res0,res1,res2,res3);
+	int biggestarea = maxvec.closestAxis4();	// presumably the index of the largest component -- TODO confirm against b3Vector4
+	return biggestarea;
+	
+}
+
+
+int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const	// Returns the index of the cached point nearest to newPoint (compared via m_localPointA), or -1 when none lies within the breaking threshold.
+{
+	b3Scalar shortestDist =  getContactBreakingThreshold() * getContactBreakingThreshold();	// squared threshold: distances below are compared squared
+	int size = getNumContacts();
+	int nearestPoint = -1;
+	for( int i = 0; i < size; i++ )
+	{
+		const b3Vector3 &mp = m_pointCache[i];
+
+		b3Vector3 diffA =  mp.m_localPointA- newPoint.m_localPointA;
+		const b3Scalar distToManiPoint = diffA.dot(diffA);
+		if( distToManiPoint < shortestDist )	// strict '<': the first of several equally near points wins
+		{
+			shortestDist = distToManiPoint;
+			nearestPoint = i;
+		}
+	}
+	return nearestPoint;
+}
+
+int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint)	// Stores newPoint in the cache and returns the slot index it was written to.
+{
+	b3Assert(validContactDistance(newPoint));
+	
+	int insertIndex = getNumContacts();	// default: append after the points already cached
+	if (insertIndex == MANIFOLD_CACHE_SIZE)	// cache full: an existing slot must be overwritten
+	{
+#if MANIFOLD_CACHE_SIZE >= 4
+		//sort cache so best points come first, based on area
+		insertIndex = sortCachedPoints(newPoint);
+#else
+		insertIndex = 0;
+#endif
+		clearUserCache(m_pointCache[insertIndex]);	// NOTE(review): runs before the insertIndex<0 clamp below, so a negative index would be used here unclamped
+		
+	} else
+	{
+		m_cachedPoints++;
+
+		
+	}
+	if (insertIndex<0)
+		insertIndex=0;
+
+	//b3Assert(m_pointCache[insertIndex].m_userPersistentData==0);
+	m_pointCache[insertIndex] = newPoint;
+	return insertIndex;
+}
+
+#endif
+
+bool b3ContactCache::validContactDistance(const b3Vector3& pt)	// True while the value stored in pt.w does not exceed gContactBreakingThreshold.
+{
+	return pt.w <= gContactBreakingThreshold;	// refreshContactPoints() stores the signed contact distance in w
+}
+
+void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i)	// Removes point i by swapping it with the last point and decrementing the point count.
+{
+	int numContacts = b3Contact4Data_getNumPoints(&newContactCache);
+	if (i!=(numContacts-1))	// nothing to swap when i already is the last point
+	{
+		b3Swap(newContactCache.m_localPosA[i],newContactCache.m_localPosA[numContacts-1]);
+		b3Swap(newContactCache.m_localPosB[i],newContactCache.m_localPosB[numContacts-1]);
+		b3Swap(newContactCache.m_worldPosB[i],newContactCache.m_worldPosB[numContacts-1]);
+	}
+	b3Contact4Data_setNumPoints(&newContactCache,numContacts-1);	// no range check on i or on an already empty set is performed here
+
+}
+
+
+void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts)	// Recomputes world-space data for every contact under trA/trB, then removes the contacts that moved beyond gContactBreakingThreshold.
+{
+
+	int numContacts = b3Contact4Data_getNumPoints(&contacts);
+	
+
+	int i;
+	/// first refresh worldspace positions and distance
+	for (i=numContacts-1;i>=0;i--)
+	{
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
+		contacts.m_worldPosB[i] = worldPosB;
+		float distance = (worldPosA -  worldPosB).dot(contacts.m_worldNormalOnB);	// separation of A from B measured along the stored normal
+		contacts.m_worldPosB[i].w = distance;	// the distance is kept in the w component of the world position
+	}
+
+	/// then drop each point whose refreshed distance or sideways drift exceeds the breaking threshold
+	b3Scalar distance2d;
+	b3Vector3 projectedDifference,projectedPoint;
+	for (i=numContacts-1;i>=0;i--)	// walks from the last index down, so the swap-with-last in removeContactPoint() only moves in points that were already examined
+	{
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);	// not read again in this loop; contacts.m_worldPosB[i] is used instead
+		b3Vector3&pt = contacts.m_worldPosB[i];
+		//contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction)
+		if (!validContactDistance(pt))
+		{
+			removeContactPoint(contacts,i);
+		} else
+		{
+			//contact also becomes invalid when relative movement orthogonal to normal exceeds margin
+			projectedPoint = worldPosA - contacts.m_worldNormalOnB * contacts.m_worldPosB[i].w;	// A's point moved back along the normal by the stored distance
+			projectedDifference = contacts.m_worldPosB[i] - projectedPoint;
+			distance2d = projectedDifference.dot(projectedDifference);	// squared, hence compared against the squared threshold
+			if (distance2d  > gContactBreakingThreshold*gContactBreakingThreshold )
+			{
+				removeContactPoint(contacts,i);
+			} else
+			{
+				////contact point processed callback
+				//if (gContactProcessedCallback)
+				//	(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
+			}
+		}
+	}
+	
+
+}
+
+
+
+
+
+#endif
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
new file mode 100644
index 0000000000..d6c9b0a07e
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
@@ -0,0 +1,80 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_CONTACT_CACHE_H
+#define B3_CONTACT_CACHE_H
+
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+
+
+///maximum contact breaking and merging threshold
+extern b3Scalar gContactBreakingThreshold;
+
+
+
+#define MANIFOLD_CACHE_SIZE 4
+
+///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
+///Those contact points are created by the collision narrow phase.
+///The cache can be empty, or hold 1,2,3 or 4 points. Some collision algorithms (GJK) might only add one point at a time.
+///updates/refreshes old contact points, and throw them away if necessary (distance becomes too large)
+///reduces the cache to 4 points, when more than 4 points are added, using the following rules:
+///the contact point with deepest penetration is always kept, and it tries to maximize the area covered by the points
+///note that some pairs of objects might have more than one contact manifold.
+B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache	// Declares contact-cache helpers; no data members are declared in this class.
+{	// NOTE(review): b3ContactCache.cpp as added in this patch is wrapped entirely in #if 0, so that file compiles no definition for any member below -- confirm nothing links against them.
+
+	
+
+	
+	/// sort cached points so most isolated points come first
+	int	sortCachedPoints(const b3Vector3& pt);	// private (declared before the public: label)
+
+	
+
+public:
+
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	
+	
+	int addManifoldPoint( const b3Vector3& newPoint);	// returns an int; the disabled definition returns the slot index written
+
+	/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
+	{
+		b3Assert(validContactDistance(newPoint));
+		m_pointCache[insertIndex] = newPoint;
+	}
+	*/
+
+
+	
+	static bool validContactDistance(const b3Vector3& pt);	// the disabled definition compares pt.w against gContactBreakingThreshold
+	
+	/// calculates new worldspace coordinates and depth, and rejects points that exceed the collision margin
+	static void	refreshContactPoints(  const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache);
+
+	static void removeContactPoint(struct b3Contact4Data& newContactCache,int i);	// i is the index of the point to drop
+	
+
+};
+
+
+
+#endif //B3_CONTACT_CACHE_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
new file mode 100644
index 0000000000..fb435aa7fd
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
@@ -0,0 +1,4733 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+bool findSeparatingAxisOnGpu = true;	// The flags below are mutable, externally visible globals. NOTE(review): where each one is read is not shown next to these definitions -- confirm before changing a default.
+bool splitSearchSepAxisConcave = false;
+bool splitSearchSepAxisConvex = true;
+bool useMprGpu = true;//use mpr for edge-edge  (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling... NOTE(review): the default is already true; the GpuSatCollision constructor builds the MPR kernels only when this is set.
+bool bvhTraversalKernelGPU = true;
+bool findConcaveSeparatingAxisKernelGPU = true;
+bool clipConcaveFacesAndFindContactsCPU = false;//false;//true;
+bool clipConvexFacesAndFindContactsCPU = false;//false;//true;
+bool reduceConcaveContactsOnGPU = true;//false;
+bool reduceConvexContactsOnGPU = true;//false;
+bool findConvexClippingFacesGPU = true;
+bool useGjk = false;///option for CPU/host testing, when findSeparatingAxisOnGpu = false
+bool useGjkContacts = false;///option for CPU/host testing when findSeparatingAxisOnGpu = false
+
+
+static int myframecount=0;///for testing; file-local counter
+
+///This file was written by Erwin Coumans
+///Separating axis test based on work from Pierre Terdiman
+///And contact clipping based on work from Simon Hobbs
+
+//#define B3_DEBUG_SAT_FACE
+
+//#define CHECK_ON_HOST
+
+#ifdef CHECK_ON_HOST
+//#define PERSISTENT_CONTACTS_HOST
+#endif
+
+int b3g_actualSATPairTests=0;
+
+#include "b3ConvexHullContact.h"
+#include <string.h>//memcpy
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
+
+#include "Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;
+
+
+#include <float.h> //for FLT_MAX
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+//#include "AdlQuaternion.h"
+
+#include "kernels/satKernels.h"
+#include "kernels/mprKernels.h"
+
+#include "kernels/satConcaveKernels.h"
+
+#include "kernels/satClipHullContacts.h"
+#include "kernels/bvhTraversal.h"
+#include "kernels/primitiveContacts.h"
+
+
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl"
+#define BT_NARROWPHASE_SAT_CONCAVE_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl"
+
+#define BT_NARROWPHASE_MPR_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl"
+
+
+#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl"
+#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl"
+#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl"
+
+
+#ifndef __global
+#define __global
+#endif
+
+#ifndef __kernel
+#define __kernel
+#endif
+
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h"
+
+
+
+#define dot3F4 b3Dot
+
+GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q )	// Stores the OpenCL handles, binds the device arrays to them, then compiles every program and creates every kernel this class launches.
+:m_context(ctx),
+m_device(device),
+m_queue(q),
+
+m_findSeparatingAxisKernel(0),
+m_findSeparatingAxisVertexFaceKernel(0),
+m_findSeparatingAxisEdgeEdgeKernel(0),
+m_unitSphereDirections(m_context,m_queue),
+
+m_totalContactsOut(m_context, m_queue),
+m_sepNormals(m_context, m_queue),
+m_dmins(m_context,m_queue),
+
+m_hasSeparatingNormals(m_context, m_queue),
+m_concaveSepNormals(m_context, m_queue),
+m_concaveHasSeparatingNormals(m_context,m_queue),
+m_numConcavePairsOut(m_context, m_queue),
+
+
+m_gpuCompoundPairs(m_context, m_queue),
+
+
+m_gpuCompoundSepNormals(m_context, m_queue),
+m_gpuHasCompoundSepNormals(m_context, m_queue),
+
+m_numCompoundPairsOut(m_context, m_queue)
+{
+	m_totalContactsOut.push_back(0);	// a single element, initialised to 0
+	
+	cl_int errNum=0;	// NOTE(review): checked only through b3Assert, and the cl_program handles created below are never released in this constructor -- confirm that is intended
+
+	if (1)	// separating-axis (SAT), concave SAT and optional MPR kernels
+	{
+		const char* mprSrc = mprKernelsCL;
+		
+		const char* srcConcave = satConcaveKernelsCL;
+		char flags[1024]={0};	// empty compiler-option string
+//#ifdef CL_PLATFORM_INTEL
+//		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
+//#endif
+		m_mprPenetrationKernel  = 0;	// stay null when useMprGpu is false, so the destructor skips them
+		m_findSeparatingAxisUnitSphereKernel = 0;
+
+		if (useMprGpu)
+		{
+			cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,mprSrc,&errNum,flags,BT_NARROWPHASE_MPR_PATH);
+			b3Assert(errNum==CL_SUCCESS);
+		
+			m_mprPenetrationKernel  = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "mprPenetrationKernel",&errNum,mprProg );
+			b3Assert(m_mprPenetrationKernel);
+			b3Assert(errNum==CL_SUCCESS);
+
+			m_findSeparatingAxisUnitSphereKernel =  b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "findSeparatingAxisUnitSphereKernel",&errNum,mprProg );
+			b3Assert(m_findSeparatingAxisUnitSphereKernel);
+            b3Assert(errNum==CL_SUCCESS);
+
+
+			int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);	// element count of the unitSphere162 table
+			m_unitSphereDirections.resize(numDirections);
+			m_unitSphereDirections.copyFromHostPointer(unitSphere162,numDirections,0,true);
+
+
+		}
+
+
+		cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,satKernelsCL,&errNum,flags,BT_NARROWPHASE_SAT_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+
+		cl_program satConcaveProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcConcave,&errNum,flags,BT_NARROWPHASE_SAT_CONCAVE_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisKernel);
+		b3Assert(errNum==CL_SUCCESS);
+
+
+		m_findSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisVertexFaceKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisVertexFaceKernel);
+		b3Assert(errNum==CL_SUCCESS);
+		m_findSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisEdgeEdgeKernel",&errNum,satProg );
+		b3Assert(m_findSeparatingAxisEdgeEdgeKernel);	// checks the kernel created on the previous line (the vertex-face kernel was asserted here before)
+		b3Assert(errNum==CL_SUCCESS);
+
+
+		m_findConcaveSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findConcaveSeparatingAxisKernel",&errNum,satProg );
+		b3Assert(m_findConcaveSeparatingAxisKernel);
+		b3Assert(errNum==CL_SUCCESS);
+        
+        m_findConcaveSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisVertexFaceKernel",&errNum,satConcaveProg );
+		b3Assert(m_findConcaveSeparatingAxisVertexFaceKernel);
+		b3Assert(errNum==CL_SUCCESS);
+        
+        m_findConcaveSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisEdgeEdgeKernel",&errNum,satConcaveProg );
+		b3Assert(m_findConcaveSeparatingAxisEdgeEdgeKernel);
+		b3Assert(errNum==CL_SUCCESS);
+        
+     
+        
+		
+		m_findCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findCompoundPairsKernel",&errNum,satProg );
+		b3Assert(m_findCompoundPairsKernel);
+		b3Assert(errNum==CL_SUCCESS);
+		m_processCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "processCompoundPairsKernel",&errNum,satProg );
+		b3Assert(m_processCompoundPairsKernel);
+		b3Assert(errNum==CL_SUCCESS);
+	}
+
+	if (1)	// hull clipping and contact-reduction kernels
+	{
+		const char* srcClip = satClipKernelsCL;
+
+		char flags[1024]={0};
+//#ifdef CL_PLATFORM_INTEL
+//		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
+//#endif
+
+		cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_clipCompoundsHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipCompoundsHullHullKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+		
+
+        m_findClippingFacesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "findClippingFacesKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+
+        m_clipFacesAndFindContacts = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipFacesAndFindContactsKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);        
+
+		m_clipHullHullConcaveConvexKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullConcaveConvexKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+
+//		m_extractManifoldAndAddContactKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg);
+	//	b3Assert(errNum==CL_SUCCESS);
+
+        m_newContactReductionKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip,
+                            "newContactReductionKernel",&errNum,satClipContactsProg);
+		b3Assert(errNum==CL_SUCCESS);
+	}
+   else	// unreachable while the condition above is the literal 1; kept so the kernels have null defaults if that changes
+	{
+		m_clipHullHullKernel=0;
+		m_clipCompoundsHullHullKernel = 0;
+        m_findClippingFacesKernel = 0;
+        m_newContactReductionKernel=0;
+        m_clipFacesAndFindContacts = 0;
+		m_clipHullHullConcaveConvexKernel = 0;
+//		m_extractManifoldAndAddContactKernel = 0;
+	}
+
+	 if (1)	// BVH traversal kernel
+	{
+		const char* srcBvh = bvhTraversalKernelCL;
+		cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+
+	}
+        
+	 {	// primitive-contact kernels
+		 const char* primitiveContactsSrc = primitiveContactsKernelsCL;
+		cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH);
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+
+		m_findConcaveSphereContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "findConcaveSphereContactsKernel",&errNum,primitiveContactsProg );
+		b3Assert(errNum==CL_SUCCESS);
+		b3Assert(m_findConcaveSphereContactsKernel);
+
+		m_processCompoundPairsPrimitivesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "processCompoundPairsPrimitivesKernel",&errNum,primitiveContactsProg,"");
+		b3Assert(errNum==CL_SUCCESS);
+		b3Assert(m_processCompoundPairsPrimitivesKernel);
+		 
+	 }
+	
+
+}
+
+GpuSatCollision::~GpuSatCollision()	// Releases every kernel handle created by the constructor; each release is skipped when the handle is null.
+{	// NOTE(review): only kernels are released here; no clReleaseProgram call appears for the programs the constructor compiled -- confirm whether that is intended.
+	
+	if (m_findSeparatingAxisVertexFaceKernel)
+		clReleaseKernel(m_findSeparatingAxisVertexFaceKernel);
+
+	if (m_findSeparatingAxisEdgeEdgeKernel)
+		clReleaseKernel(m_findSeparatingAxisEdgeEdgeKernel);
+
+	if (m_findSeparatingAxisUnitSphereKernel)	// null when the constructor ran with useMprGpu == false
+		clReleaseKernel(m_findSeparatingAxisUnitSphereKernel);
+
+	if (m_mprPenetrationKernel)	// null when the constructor ran with useMprGpu == false
+		clReleaseKernel(m_mprPenetrationKernel);
+
+
+	if (m_findSeparatingAxisKernel)
+		clReleaseKernel(m_findSeparatingAxisKernel);
+
+    if (m_findConcaveSeparatingAxisVertexFaceKernel)
+        clReleaseKernel(m_findConcaveSeparatingAxisVertexFaceKernel);
+
+    
+    if (m_findConcaveSeparatingAxisEdgeEdgeKernel)
+        clReleaseKernel(m_findConcaveSeparatingAxisEdgeEdgeKernel);
+    
+	if (m_findConcaveSeparatingAxisKernel)
+		clReleaseKernel(m_findConcaveSeparatingAxisKernel);
+
+	if (m_findCompoundPairsKernel)
+		clReleaseKernel(m_findCompoundPairsKernel);
+
+	if (m_processCompoundPairsKernel)
+		clReleaseKernel(m_processCompoundPairsKernel);
+    
+    if (m_findClippingFacesKernel)
+        clReleaseKernel(m_findClippingFacesKernel);
+   
+    if (m_clipFacesAndFindContacts)
+        clReleaseKernel(m_clipFacesAndFindContacts);
+    if (m_newContactReductionKernel)
+        clReleaseKernel(m_newContactReductionKernel);
+	if (m_primitiveContactsKernel)
+		clReleaseKernel(m_primitiveContactsKernel);
+    
+	if (m_findConcaveSphereContactsKernel)
+		clReleaseKernel(m_findConcaveSphereContactsKernel);
+
+	if (m_processCompoundPairsPrimitivesKernel)
+		clReleaseKernel(m_processCompoundPairsPrimitivesKernel);
+
+	if (m_clipHullHullKernel)
+		clReleaseKernel(m_clipHullHullKernel);
+	if (m_clipCompoundsHullHullKernel)
+		clReleaseKernel(m_clipCompoundsHullHullKernel);
+
+	if (m_clipHullHullConcaveConvexKernel)
+		clReleaseKernel(m_clipHullHullConcaveConvexKernel);
+//	if (m_extractManifoldAndAddContactKernel)
+	//	clReleaseKernel(m_extractManifoldAndAddContactKernel);
+
+	if (m_bvhTraversalKernel)
+		clReleaseKernel(m_bvhTraversalKernel);
+
+}
+
+struct MyTriangleCallback : public b3NodeOverlapCallback	// Debug callback: prints the stored body indices and the triangle index for every node it is handed.
+{
+	int m_bodyIndexA;	// not initialised by this struct; the user must set it before processNode() prints it
+	int m_bodyIndexB;	// not initialised by this struct; the user must set it before processNode() prints it
+
+	virtual void processNode(int subPart, int triangleIndex)	// subPart is accepted but not used
+	{
+		printf("bodyIndexA %d, bodyIndexB %d\n",m_bodyIndexA,m_bodyIndexB);
+		printf("triangleIndex %d\n", triangleIndex);
+	}
+};
+
+
+#define float4 b3Vector3
+#define make_float4(x,y,z,w) b3MakeVector3(x,y,z,w)
+
+float signedDistanceFromPointToPlane(const float4& point, const float4& planeEqn, float4* closestPointOnFace)	// Returns dot(n,point)+d for the plane (n = planeEqn xyz, d = planeEqn[3]) and writes point - dist*n to *closestPointOnFace.
+{
+	float4 n = planeEqn;
+	n[3] = 0.f;	// clear the 4th component so n carries only the normal
+	float dist = dot3F4(n, point) + planeEqn[3];
+	*closestPointOnFace = point - dist * n;	// assumes n is unit length for this to be the true closest point -- TODO confirm
+	return dist;
+}
+
+
+
+#define cross3(a,b) (a.cross(b))
+b3Vector3 transform(const b3Vector3* v, const b3Vector3* pos, const b3Quaternion* orn)
+{
+	// Build a b3Transform with origin *pos and rotation *orn, then apply it to *v.
+	b3Transform xform;
+	xform.setIdentity();
+	xform.setOrigin(*pos);
+	xform.setRotation(*orn);
+	return xform(*v);
+}
+
+
+inline bool IsPointInPolygon(const float4& p, 	// Returns true when p is on the inner side of every edge of the face; otherwise writes the closest point on the first violated edge to *out and returns false.
+							const b3GpuFace* face,	// face whose plane normal and index range are used
+							 const float4* baseVertex,	// vertex array indexed through convexIndices
+							const  int* convexIndices,	// index array; the face's entries start at face->m_indexOffset
+							float4* out)	// written only when an edge test fails
+{
+    float4 a;
+    float4 b;
+    float4 ab;
+    float4 ap;
+    float4 v;
+
+	float4 plane = b3MakeVector3(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);	// face normal only; the plane offset is dropped
+	
+	if (face->m_numIndices<2)	// no edge to test; *out is left untouched
+		return false;
+
+	
+	float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];	// start from the last vertex so the first edge closes the loop
+	b = v0;
+
+    for(unsigned i=0; i != face->m_numIndices; ++i)
+    {
+		a = b;
+		float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];
+		b = vi;
+        ab = b-a;
+        ap = p-a;
+        v = cross3(ab,plane);	// in-plane direction perpendicular to edge ab
+
+        if (b3Dot(ap, v) > 0.f)	// p lies on the positive side of this edge: treated as outside
+        {
+            float ab_m2 = b3Dot(ab, ab);
+            float rt = ab_m2 != 0.f ? b3Dot(ab, ap) / ab_m2 : 0.f;	// parameter of p projected onto ab; 0 for a zero-length edge
+            if (rt <= 0.f)
+            {
+                *out = a;
+            }
+            else if (rt >= 1.f) 
+            {
+                *out = b;
+            }
+            else
+            {
+            	float s = 1.f - rt;
+				out[0].x = s * a.x + rt * b.x;	// only x, y and z are written in this branch; out[0].w keeps its previous value
+				out[0].y = s * a.y + rt * b.y;
+				out[0].z = s * a.z + rt * b.z;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+#define normalize3(a) (a.normalize())
+
+
+int extractManifoldSequentialGlobal( const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx)	// Picks up to 4 of the nPoints contacts in p: writes 4 indices to contactIdx[0] and returns 4, or returns nPoints unchanged when nPoints <= 4.
+{
+	if( nPoints == 0 )
+        return 0;
+    
+    if (nPoints <=4)	// nothing to reduce; contactIdx is NOT written on this path
+        return nPoints;
+    
+    
+    if (nPoints >64)	// only the first 64 points are considered
+        nPoints = 64;
+    
+	float4 center = b3MakeVector3(0,0,0,0);
+	{
+		
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints;	// mean of the considered points
+	}
+    
+	
+    
+	//	sample 4 directions
+    
+    float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector );	// u and v are both built from cross products with nearNormal
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );	// NOTE(review): no guard for a zero-length u (p[0]-center zero or parallel to nearNormal) -- confirm normalize() tolerates that
+    v = normalize3( v );
+    
+    
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    
+    int minIndex=-1;
+    
+    float4 maxDots;	// despite the name, each component ends up tracking the SMALLEST projection seen (see the '<' tests below)
+    maxDots.x = FLT_MIN;	// FLT_MIN is the smallest positive normalized float, not the most negative value
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)	// w holds the per-point value being minimised (the 'deepest penetration' above)
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        float f;
+        float4 r = p[ie]-center;
+        f = dot3F4( u, r );
+        if (f<maxDots.x)	// keeps the point farthest along -u; NOTE(review): if no f ever drops below FLT_MIN, contactIdx[0].x is never written
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie;
+        }
+        
+        f = dot3F4( -u, r );
+        if (f<maxDots.y)	// keeps the point farthest along +u
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie;
+        }
+        
+        
+        f = dot3F4( v, r );
+        if (f<maxDots.z)	// keeps the point farthest along -v
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie;
+        }
+        
+        f = dot3F4( -v, r );
+        if (f<maxDots.w)	// keeps the point farthest along +v
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie;
+        }
+        
+    }
+    
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)	// the point with the smallest w was not selected by any direction
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    
+    return 4;
+    
+}
+
+
+
+#define MAX_VERTS 1024
+
+
+inline void project(const b3ConvexPolyhedronData& hull,  const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max)	// Writes to min/max the extent of the hull's vertices along dir, for a hull placed at pos with orientation orn.
+{
+	min = FLT_MAX;
+	max = -FLT_MAX;
+	int numVerts = hull.m_numVertices;
+
+	const float4 localDir = b3QuatRotate(orn.inverse(),dir);	// rotate dir by the inverse orientation so the stored vertices can be used untransformed
+
+	b3Scalar offset = dot3F4(pos,dir);	// contribution of the translation, added once after the loop
+
+	for(int i=0;i<numVerts;i++)
+	{
+		//b3Vector3 pt = trans * vertices[m_vertexOffset+i];
+		//b3Scalar dp = pt.dot(dir);
+		//b3Vector3 vertex = vertices[hull.m_vertexOffset+i];
+		b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset+i],localDir);	// the hull's vertices start at m_vertexOffset in the shared array
+		//b3Assert(dp==dpL);
+		if(dp < min)	min = dp;
+		if(dp > max)	max = dp;
+	}
+	if(min>max)	// e.g. when the hull has no vertices, min/max still hold their initial sentinels
+	{
+		b3Scalar tmp = min;
+		min = max;
+		max = tmp;
+	}
+	min += offset;
+	max += offset;
+}
+
+
+static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 	// Projects both hulls onto sep_axis; returns false when the two intervals are disjoint, otherwise writes the smaller overlap to depth and returns true.
+	const float4& posA,const b3Quaternion& ornA,
+	const float4& posB,const b3Quaternion& ornB,
+	const float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth)
+{
+	b3Scalar Min0,Max0;
+	b3Scalar Min1,Max1;
+	project(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0);
+	project(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1);
+
+	if(Max0<Min1 || Max1<Min0)	// intervals do not overlap; depth is left unmodified
+		return false;
+
+	b3Scalar d0 = Max0 - Min1;
+	assert(d0>=0.0f);	// follows from the overlap test above
+	b3Scalar d1 = Max1 - Min0;
+	assert(d1>=0.0f);	// follows from the overlap test above
+	depth = d0<d1 ? d0:d1;
+	return true;
+}
+
+// Returns false as soon as any of |v.x|, |v.y|, |v.z| exceeds 1e-6; true otherwise.
+inline bool IsAlmostZero(const b3Vector3& v)
+{
+	const bool anyComponentLarge = fabsf(v.x)>1e-6 || fabsf(v.y)>1e-6 || fabsf(v.z)>1e-6;
+	return !anyComponentLarge;
+}
+
+
+// Separating-axis test between two convex hulls.
+//
+// Candidate axes are, in order: the world-space face normals of hullA, the world-space
+// face normals of hullB, and the normalized cross products of every (unique edge of A,
+// unique edge of B) pair. Each candidate is flipped so that it has a non-negative dot
+// product with deltaC2 (world center of A minus world center of B) before it is tested.
+//
+// Returns false as soon as TestSepAxis reports that a candidate separates the hulls.
+// Otherwise returns true and 'sep' holds the candidate with the smallest overlap depth,
+// negated at the end if its dot product with -deltaC2 is positive.
+//
+// indicesA and indicesB are not read by this function.
+// NOTE(review): 'sep' is only written when some candidate produced a depth below FLT_MAX;
+// if no candidate is tested it keeps the caller's value and is still read at the end.
+static bool findSeparatingAxis(	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const float4& posA1,
+	const b3Quaternion& ornA,
+	const float4& posB1,
+	const b3Quaternion& ornB,
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 
+	const b3AlignedObjectArray<b3GpuFace>& facesA,
+	const b3AlignedObjectArray<int>& indicesA,
+	const b3AlignedObjectArray<b3Vector3>& verticesB, 
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, 
+	const b3AlignedObjectArray<b3GpuFace>& facesB,
+	const b3AlignedObjectArray<int>& indicesB,
+
+	b3Vector3& sep)
+{
+	B3_PROFILE("findSeparatingAxis");
+
+	// Global counter of SAT pair tests; it is only incremented here.
+	b3g_actualSATPairTests++;
+	// Work on copies of the positions with w cleared.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+//#ifdef TEST_INTERNAL_OBJECTS
+	// World-space hull centers; their difference orients every candidate axis.
+	float4 c0local = (float4&)hullA.m_localCenter;
+	float4 c0 = transform(&c0local, &posA, &ornA);
+	float4 c1local = (float4&)hullB.m_localCenter;
+	float4 c1 = transform(&c1local,&posB,&ornB);
+	const float4 deltaC2 = c0 - c1;
+//#endif
+
+	// Smallest overlap depth found so far.
+	b3Scalar dmin = FLT_MAX;
+	// Incremented per face normal tested; not read anywhere else in this function.
+	int curPlaneTests=0;
+
+	int numFacesA = hullA.m_numFaces;
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4& normal = (float4&)facesA[hullA.m_faceOffset+i].m_plane;
+		float4 faceANormalWS = b3QuatRotate(ornA,normal);
+
+		if (dot3F4(deltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+
+		curPlaneTests++;
+#ifdef TEST_INTERNAL_OBJECTS
+		// NOTE(review): this region uses transA, transB and DeltaC2, none of which are
+		// declared in this function, so it presumably does not build when the macro is defined.
+		gExpectedNbTests++;
+		if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		
+		b3Scalar d;
+		if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,faceANormalWS, verticesA, verticesB,d))
+			return false;
+
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)faceANormalWS;
+		}
+	}
+
+	int numFacesB = hullB.m_numFaces;
+	// Test normals from hullB
+	for(int i=0;i<numFacesB;i++)
+	{
+		float4 normal = (float4&)facesB[hullB.m_faceOffset+i].m_plane;
+		float4 WorldNormal = b3QuatRotate(ornB, normal);
+
+		if (dot3F4(deltaC2,WorldNormal)<0)
+		{
+			WorldNormal*=-1.f;
+		}
+		curPlaneTests++;
+#ifdef TEST_INTERNAL_OBJECTS
+		// NOTE(review): same undeclared names as in the hullA loop (transA, transB, DeltaC2).
+		gExpectedNbTests++;
+		if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		b3Scalar d;
+		if(!TestSepAxis(hullA, hullB,posA,ornA,posB,ornB,WorldNormal,verticesA,verticesB,d))
+			return false;
+
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)WorldNormal;
+		}
+	}
+
+	// Incremented per edge pair; not read anywhere else in this function.
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA.m_numUniqueEdges;e0++)
+	{
+		const float4& edge0 = (float4&) uniqueEdgesA[hullA.m_uniqueEdgesOffset+e0];
+		float4 edge0World = b3QuatRotate(ornA,(float4&)edge0);
+
+		for(int e1=0;e1<hullB.m_numUniqueEdges;e1++)
+		{
+			const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset+e1];
+			float4 edge1World = b3QuatRotate(ornB,(float4&)edge1);
+
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			curEdgeEdge++;
+			// Skip edge pairs whose cross product is (almost) zero; it cannot be normalized into an axis.
+			if(!IsAlmostZero((b3Vector3&)crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(deltaC2,crossje)<0)
+					crossje*=-1.f;
+
+
+#ifdef TEST_INTERNAL_OBJECTS
+				// NOTE(review): 'Cross' (as well as transA, transB, DeltaC2) is not declared in this function.
+				gExpectedNbTests++;
+				if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin))
+					continue;
+				gActualNbTests++;
+#endif
+
+				b3Scalar dist;
+				if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,crossje, verticesA,verticesB,dist))
+					return false;
+
+				if(dist<dmin)
+				{
+					dmin = dist;
+					sep = (b3Vector3&)crossje;
+				}
+			}
+		}
+
+	}
+
+	
+	// Final orientation: negate the chosen axis when its dot product with -deltaC2 is positive.
+	if((dot3F4(-deltaC2,(float4&)sep))>0.0f)
+		sep = -sep;
+
+	return true;
+}
+
+
+// Edge-edge part of the separating-axis test.
+//
+// For every (unique edge of hullA, unique edge of hullB) pair the normalized cross
+// product of the world-space edges is used as a candidate axis, flipped so that it has
+// a non-negative dot product with DeltaC2. Both hulls are projected onto the axis.
+//
+// Returns false as soon as one candidate separates the hulls (the projected intervals
+// do not overlap); *sep and *dmin are then left as they were before that candidate.
+// Otherwise returns true; whenever a candidate's overlap is smaller than *dmin, *dmin
+// and *sep are updated, and *sep is finally negated if its dot product with -DeltaC2
+// is positive.
+//
+// *dmin and *sep are in/out: the caller supplies the best depth/axis found so far.
+// 'faces' and 'indices' are not read by this function. Both hulls index into the same
+// 'vertices' and 'uniqueEdges' arrays through their own offsets.
+bool findSeparatingAxisEdgeEdge(	__global const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, 
+	const b3Float4& posA1,
+	const b3Quat& ornA,
+	const b3Float4& posB1,
+	const b3Quat& ornB,
+	const b3Float4& DeltaC2,
+	__global const b3AlignedObjectArray<float4>& vertices, 
+	__global const b3AlignedObjectArray<float4>& uniqueEdges, 
+	__global const b3AlignedObjectArray<b3GpuFace>& faces,
+	__global const b3AlignedObjectArray<int>&  indices,
+	float4* sep,
+	float* dmin)
+{
+	// Work on copies of the positions with w cleared.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = b3QuatRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = b3QuatRotate(ornB,edge1);
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			// Skip edge pairs whose cross product is (almost) zero; it cannot be normalized into an axis.
+			if(IsAlmostZero(crossje))
+				continue;
+
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje*=-1.f;
+
+			float Min0,Max0;
+			float Min1,Max1;
+			project(*hullA,posA,ornA,crossje,vertices, Min0, Max0);
+			project(*hullB,posB,ornB,crossje,vertices, Min1, Max1);
+
+			// Disjoint intervals: this axis separates the hulls. The previous code stored
+			// 'false' in a local flag that was immediately overwritten and never read, so
+			// the separation was silently ignored and a negative overlap could end up in *dmin.
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+
+			const float d0 = Max0 - Min1;
+			const float d1 = Max1 - Min0;
+			const float dist = d0<d1 ? d0:d1;
+
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+
+	}
+
+	// Final orientation: negate the chosen axis when its dot product with -DeltaC2 is positive.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+// Linearly interpolates the x, y and z components from 'a' (t = 0) towards 'b' (t = 1).
+// The w component of the result is 0.
+__inline float4 lerp3(const float4& a,const float4& b, float  t)
+{
+	const float4 blended = b3MakeVector3(
+		a.x + (b.x - a.x) * t,
+		a.y + (b.y - a.y) * t,
+		a.z + (b.z - a.z) * t,
+		0.f);
+	return blended;
+}
+
+
+// Clips the polygon pVtxIn[0..numVertsIn) against the plane
+// dot3F4(planeNormalWS, x) + planeEqWS = 0 and keeps the part with negative signed
+// distance. Edges are visited in order, starting with the closing edge
+// (last vertex -> first vertex). Returns the number of vertices written to ppVtxOut;
+// inputs with fewer than two vertices produce no output.
+int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float planeEqWS, float4* ppVtxOut)
+{
+	if (numVertsIn < 2)
+		return 0;
+
+	int written = 0;
+	float4 prevVertex = pVtxIn[numVertsIn-1];
+	float prevDist = dot3F4(planeNormalWS,prevVertex)+planeEqWS;
+
+	for (int idx = 0; idx < numVertsIn; ++idx)
+	{
+		const float4 currVertex = pVtxIn[idx];
+		const float currDist = dot3F4(planeNormalWS,currVertex)+planeEqWS;
+
+		const bool prevBehind = (prevDist<0);
+		const bool currBehind = (currDist<0);
+
+		// The edge changes side: emit the point where it meets the plane first.
+		if (prevBehind != currBehind)
+		{
+			ppVtxOut[written++] = lerp3(prevVertex, currVertex,(prevDist * 1.f/(prevDist - currDist)) );
+		}
+
+		// The edge's end point is kept whenever it lies behind the plane.
+		if (currBehind)
+		{
+			ppVtxOut[written++] = currVertex;
+		}
+
+		prevVertex = currVertex;
+		prevDist = currDist;
+	}
+	return written;
+}
+
+
+// Clips a world-space polygon (worldVertsB1[0..numWorldVertsB1)) against hull A and
+// emits the surviving vertices as contact points.
+//
+// Steps:
+//  1. Pick the face of hullA whose world-space normal has the smallest dot product
+//     with separatingNormal.
+//  2. For every edge of that face, build a side plane from the edge and the face
+//     normal and clip the polygon against it with clipFace, ping-ponging between
+//     worldVertsB1 and worldVertsB2.
+//  3. For every remaining vertex compute its signed distance to the chosen face plane,
+//     clamp it from below to minDist, and if it is <= maxDist append
+//     (x, y, z, depth) to contactsOut.
+//
+// Returns the number of contacts written (at most contactCapacity). Returns 0 when
+// hullA has no faces.
+// NOTE(review): capacityWorldVertsB2 is not checked here or in clipFace, so the
+// caller must provide buffers large enough for the clipped polygon.
+int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedronData* hullA,  
+	const float4& posA, const b3Quaternion& ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	//const float4* verticesB,	const b3GpuFace* facesB,	const int* indicesB,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	// Input/output buffers for the clipping passes; swapped after each plane.
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	
+	int numVertsIn = numWorldVertsB1;
+	int numVertsOut = 0;
+
+	// Find the face of A whose world normal has the smallest dot product with the separating normal.
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = b3MakeVector3(
+				facesA[hullA->m_faceOffset+face].m_plane.x, 
+				facesA[hullA->m_faceOffset+face].m_plane.y, 
+				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = b3QuatRotate(ornA,Normal);
+		
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+
+	b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];
+
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = b3QuatRotate(ornA,edge0);
+		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float4 worldPlaneAnormal1 = b3QuatRotate(ornA,planeNormalA);
+
+		// Side plane through vertex 'a', spanned by the edge and the face normal.
+		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);
+		float4 worldA1 = transform(&a,&posA,&ornA);
+		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);
+		
+		float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		
+		//clip face
+		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+
+		// The output of this pass is the input of the next one.
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+		numVertsOut = 0;
+	}
+
+	
+	// only keep points that are behind the witness face
+	{
+		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+		float localPlaneEq = polyA.m_plane.w;
+		float4 planeNormalWS = b3QuatRotate(ornA,localPlaneNormal);
+		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);
+		for (int i=0;i<numVertsIn;i++)
+		{
+			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;
+			// Depths below minDist are clamped to minDist rather than rejected.
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+			if (numContactsOut<contactCapacity)
+			{
+				if (depth <=maxDist)
+				{
+					float4 pointInWorld = pVtxIn[i];
+					// The contact stores the world position in xyz and the depth in w.
+					contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+				}
+			} else
+			{
+				// Format string previously read "(%d,%df)", which printed a stray 'f'.
+				b3Error("exceeding contact capacity (%d,%d)\n", numContactsOut,contactCapacity);
+			}
+		}
+	}
+
+	return numContactsOut;
+}
+
+
+
+// Generates contact points between two convex hulls for a given separating normal.
+//
+// The face of hullB whose world-space normal has the largest dot product with
+// separatingNormal is transformed to world space into worldVertsB1 and then clipped
+// against hullA by clipFaceAgainstHull. worldVertsB1/worldVertsB2 are caller-provided
+// scratch buffers of capacityWorldVerts elements each.
+//
+// Returns the number of contacts written to contactsOut (at most contactCapacity),
+// or 0 when hullB has no faces.
+// NOTE(review): the subsequent clipping passes are not bounded by capacityWorldVerts;
+// only the initial copy of the face of B is.
+static int	clipHullAgainstHull(const float4& separatingNormal, 
+	const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, 
+	const float4& posA, const b3Quaternion& ornA,const float4& posB, const b3Quaternion& ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const b3AlignedObjectArray<float4>& verticesA,	const b3AlignedObjectArray<b3GpuFace>& facesA,	const b3AlignedObjectArray<int>& indicesA,
+	const b3AlignedObjectArray<float4>& verticesB,	const b3AlignedObjectArray<b3GpuFace>& facesB,	const b3AlignedObjectArray<int>& indicesB,
+
+	float4*	contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	
+	B3_PROFILE("clipHullAgainstHull");
+
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+
+	{
+		// 'once' limits the BT_DEBUG_SAT_FACE diagnostics below to the first call.
+		static bool once = true;
+		
+		for(int face=0;face<hullB.m_numFaces;face++)
+		{
+#ifdef BT_DEBUG_SAT_FACE
+			if (once)
+				printf("face %d\n",face);
+			const b3GpuFace* faceB = &facesB[hullB.m_faceOffset+face];
+			if (once)
+			{
+				for (int i=0;i<faceB->m_numIndices;i++)
+				{
+					float4 vert = verticesB[hullB.m_vertexOffset+indicesB[faceB->m_indexOffset+i]];
+					printf("vert[%d] = %f,%f,%f\n",i,vert.x,vert.y,vert.z);
+				}
+			}
+#endif //BT_DEBUG_SAT_FACE
+			{
+				const float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset+face].m_plane.x, 
+					facesB[hullB.m_faceOffset+face].m_plane.y, facesB[hullB.m_faceOffset+face].m_plane.z,0.f);
+				const float4 WorldNormal = b3QuatRotate(ornB, Normal);
+#ifdef BT_DEBUG_SAT_FACE
+				if (once)
+					printf("faceNormal = %f,%f,%f\n",Normal.x,Normal.y,Normal.z);
+#endif
+				float d = dot3F4(WorldNormal,separatingNormal);
+				if (d > dmax)
+				{
+					dmax = d;
+					closestFaceB = face;
+				}
+			}
+		}
+		once = false;
+	}
+
+	
+	b3Assert(closestFaceB>=0);
+	// No face was selected (e.g. hullB has no faces). Bail out before closestFaceB is
+	// used as an index; previously facesB was indexed with -1 whenever the assert
+	// above was compiled out.
+	if (closestFaceB<0)
+		return numContactsOut;
+
+	{
+		// Transform the chosen face of B to world space.
+		const b3GpuFace& polyB = facesB[hullB.m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		b3Assert(numVertices<=capacityWorldVerts);
+		// Never write past the end of the caller's scratch buffer.
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)
+		{
+			const float4& b = verticesB[hullB.m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	numContactsOut = clipFaceAgainstHull((float4&)separatingNormal, &hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+			verticesA,				facesA,				indicesA,
+			contactsOut,contactCapacity);
+
+	return numContactsOut;
+}
+
+
+
+
+
+
+// Sequential helper macros. Despite the names, each one expands to a plain for loop.
+// PARALLEL_SUM: accumulates v[1..n-1] into v[0].
+// PARALLEL_DO: runs 'execution' n times; the loop variable is named 'ie'.
+// REDUCE_MAX / REDUCE_MIN: leave in v[0] the element among v[0..n-1] with the
+// largest / smallest .y member (on a tie the later element is taken).
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}
+#define REDUCE_MAX(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }
+#define REDUCE_MIN(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }
+
+// Reduces a set of contact points to at most four.
+//
+// p[i].xyz is the contact position and p[i].w its depth. With four or fewer points
+// nothing is selected and nPoints is returned unchanged (contactIdx is not touched).
+// Otherwise only the first 64 points are considered: two directions u and v are built
+// from nearNormal and (p[0] - centroid), and for each of u, -u, v, -v the index of the
+// point with the smallest dot product is written to contactIdx[0].x/.y/.z/.w. If the
+// point with the smallest w is not among them it replaces contactIdx[0].x. Returns 4.
+//
+// The running minima start at FLT_MIN (the smallest positive normalized float), so a
+// slot is only written by a point whose dot product is below that value; a slot that
+// no point writes keeps whatever the caller stored in contactIdx beforehand.
+int extractManifold(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx)
+{
+	// Covers the empty case as well: nothing to reduce.
+	if (nPoints <= 4)
+		return nPoints;
+
+	const int count = (nPoints > 64) ? 64 : nPoints;
+
+	float4 centroid = make_float4(0,0,0,0);
+	for (int k=0;k<count;++k)
+		centroid += p[k];
+	centroid /= (float)count;
+
+	// v is derived from the un-normalized u; both are normalized afterwards.
+	float4 u = cross3( nearNormal, p[0] - centroid );
+	float4 v = cross3( nearNormal, u );
+	u = normalize3( u );
+	v = normalize3( v );
+
+	//	sample 4 directions
+	const float4 sampleDirs[4] = { u, -u, v, -v };
+	float smallestDot[4] = { FLT_MIN, FLT_MIN, FLT_MIN, FLT_MIN };
+	int* const slot[4] = { &contactIdx[0].x, &contactIdx[0].y, &contactIdx[0].z, &contactIdx[0].w };
+
+	//keep point with deepest penetration
+	float deepestW = FLT_MAX;
+	int deepestIndex = -1;
+
+	for (int k=0;k<count;++k)
+	{
+		if (p[k].w<deepestW)
+		{
+			deepestW = p[k].w;
+			deepestIndex = k;
+		}
+
+		const float4 r = p[k]-centroid;
+		for (int d=0;d<4;++d)
+		{
+			const float f = dot3F4( sampleDirs[d], r );
+			if (f<smallestDot[d])
+			{
+				smallestDot[d] = f;
+				*slot[d] = k;
+			}
+		}
+	}
+
+	bool deepestSelected = false;
+	for (int d=0;d<4;++d)
+	{
+		if (*slot[d] == deepestIndex)
+			deepestSelected = true;
+	}
+	if (!deepestSelected)
+	{
+		//replace the first contact with minimum (todo: replace contact with least penetration)
+		*slot[0] = deepestIndex;
+	}
+
+	return 4;
+}
+
+
+
+
+// Computes the contact manifold for one convex-convex pair on the host.
+//
+// The hulls are looked up through hostCollidablesA/B[collidableIndexA/B].m_shapeIndex
+// in hostConvexDataA/B, clipped against each other with clipHullAgainstHull along
+// sepNormalWorldSpace, and the resulting points are reduced with extractManifold.
+// If at least one contact was produced and nContacts < maxContactCapacity, one
+// b3Contact4 is appended to globalContactOut and nContacts is incremented.
+//
+// Returns the index of the contact that was written, or -1 if none was written.
+int clipHullHullSingle(
+			int bodyIndexA, int bodyIndexB,
+										 const float4& posA,
+										 const b3Quaternion& ornA,
+										 const float4& posB,
+										 const b3Quaternion& ornB,
+
+			int collidableIndexA, int collidableIndexB,
+
+			const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, 
+			b3AlignedObjectArray<b3Contact4>* globalContactOut, 
+			int& nContacts,
+			
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA,
+			const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB,
+	
+			const b3AlignedObjectArray<b3Vector3>& verticesA, 
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, 
+			const b3AlignedObjectArray<b3GpuFace>& facesA,
+			const b3AlignedObjectArray<int>& indicesA,
+	
+			const b3AlignedObjectArray<b3Vector3>& verticesB,
+			const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB,
+			const b3AlignedObjectArray<b3GpuFace>& facesB,
+			const b3AlignedObjectArray<int>& indicesB,
+
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesA,
+			const b3AlignedObjectArray<b3Collidable>& hostCollidablesB,
+			const b3Vector3& sepNormalWorldSpace,
+			int maxContactCapacity			)
+{
+	int contactIndex = -1;
+	// Local copies of the collidables and hulls; they are not read again in this
+	// function (the hulls passed to clipHullAgainstHull are looked up again below).
+	b3ConvexPolyhedronData hullA, hullB;
+    
+    b3Collidable colA = hostCollidablesA[collidableIndexA];
+    hullA = hostConvexDataA[colA.m_shapeIndex];
+    //printf("numvertsA = %d\n",hullA.m_numVertices);
+    
+    
+    b3Collidable colB = hostCollidablesB[collidableIndexB];
+    hullB = hostConvexDataB[colB.m_shapeIndex];
+    //printf("numvertsB = %d\n",hullB.m_numVertices);
+    
+	
+	// Stack buffer that receives the clipped contact points (xyz = position, w = depth).
+	float4 contactsOut[MAX_VERTS];
+	int localContactCapacity = MAX_VERTS;
+
+#ifdef _WIN32
+	b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x));
+	b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x));
+#endif
+	
+	
+	{
+		
+		// Ping-pong scratch buffers used while clipping the face of B.
+		float4 worldVertsB1[MAX_VERTS];
+		float4 worldVertsB2[MAX_VERTS];
+		int capacityWorldVerts = MAX_VERTS;
+
+		float4 hostNormal = make_float4(sepNormalWorldSpace.x,sepNormalWorldSpace.y,sepNormalWorldSpace.z,0.f);
+		int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex;
+		int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex;
+
+		// Depth window passed to the clipper: depths below minDist are clamped,
+		// points with depth above maxDist are dropped (see clipFaceAgainstHull).
+		b3Scalar minDist = -1;
+		b3Scalar maxDist = 0.;
+
+		        
+
+		b3Transform trA,trB;
+		{
+		//B3_PROFILE("transform computation");
+		//trA.setIdentity();
+		trA.setOrigin(b3MakeVector3(posA.x,posA.y,posA.z));
+		trA.setRotation(b3Quaternion(ornA.x,ornA.y,ornA.z,ornA.w));
+				
+		//trB.setIdentity();
+		trB.setOrigin(b3MakeVector3(posB.x,posB.y,posB.z));
+		trB.setRotation(b3Quaternion(ornB.x,ornB.y,ornB.z,ornB.w));
+		}
+
+		b3Quaternion trAorn = trA.getRotation();
+        b3Quaternion trBorn = trB.getRotation();
+        
+		int numContactsOut = clipHullAgainstHull(hostNormal, 
+						hostConvexDataA.at(shapeA), 
+						hostConvexDataB.at(shapeB),
+								(float4&)trA.getOrigin(), (b3Quaternion&)trAorn,
+								(float4&)trB.getOrigin(), (b3Quaternion&)trBorn,
+								worldVertsB1,worldVertsB2,capacityWorldVerts,
+								minDist, maxDist,
+								verticesA,	facesA,indicesA,
+								verticesB,	facesB,indicesB,
+								
+								contactsOut,localContactCapacity);
+
+		if (numContactsOut>0)
+		{
+			B3_PROFILE("overlap");
+
+			float4 normalOnSurfaceB = (float4&)hostNormal;
+			
+			// Default selection (first four points); extractManifold only overwrites it
+			// when there are more than four contacts.
+			b3Int4 contactIdx;
+			contactIdx.x = 0;
+			contactIdx.y = 1;
+			contactIdx.z = 2;
+			contactIdx.w = 3;
+			
+			int numPoints = 0;
+					
+			{
+			//	B3_PROFILE("extractManifold");
+				numPoints = extractManifold(contactsOut, numContactsOut, normalOnSurfaceB,  &contactIdx);
+			}
+					
+			b3Assert(numPoints);
+					
+			if (nContacts<maxContactCapacity)
+			{
+				contactIndex = nContacts;
+				// NOTE(review): at(nContacts) addresses the element added by expand() only if
+				// globalContactOut->size() equalled nContacts before the call — confirm with callers.
+				globalContactOut->expand();
+				b3Contact4& contact = globalContactOut->at(nContacts);
+				contact.m_batchIdx = 0;//i;
+				// Bodies with zero inverse mass are stored with a negated index.
+				contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;
+				contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+
+				// NOTE(review): 45874 is presumably a 16-bit fixed-point encoding of a friction
+				// coefficient of about 0.7 (computeContactPlaneConvex calls setFrictionCoeff(0.7)) — confirm.
+				contact.m_frictionCoeffCmp = 45874;
+				contact.m_restituitionCoeffCmp = 0;
+					
+	//			float distance = 0.f;
+				for (int p=0;p<numPoints;p++)
+				{
+					contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];//check if it is actually on B
+					contact.m_worldNormalOnB = normalOnSurfaceB; 
+				}
+				//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
+				// The w component of the normal carries the number of contact points.
+				contact.m_worldNormalOnB.w = (b3Scalar)numPoints;
+				nContacts++;
+			} else
+			{
+				b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts,maxContactCapacity);
+			}
+		}
+	}
+	return contactIndex;
+}
+
+	
+
+
+
+// Computes contacts between a plane (body/collidable A) and a convex hull (body/collidable B).
+//
+// The plane equation is read from faces[collidables[collidableIndexA].m_shapeIndex].m_plane
+// (xyz = normal, w = constant). Every hull vertex whose signed distance to the plane is
+// negative is stored as a world-space contact point with the distance in w, up to
+// MAX_PLANE_CONVEX_POINTS points. If more than four points were collected they are reduced
+// via extractManifoldSequentialGlobal. When at least one point remains and
+// nGlobalContactsOut < maxContactCapacity, one b3Contact4 is written to
+// globalContactsOut[nGlobalContactsOut] and the counter is incremented.
+void computeContactPlaneConvex(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3RigidBodyData* rigidBodies, 
+																const b3Collidable* collidables,
+																const b3ConvexPolyhedronData* convexShapes,
+																const b3Vector3* convexVertices,
+																const int* convexIndices,
+																const b3GpuFace* faces,
+																b3Contact4* globalContactsOut,
+																int& nGlobalContactsOut,
+																int maxContactCapacity)
+{
+
+		int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndex];
+	
+	b3Vector3 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+	b3Vector3 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+
+//	int numContactsOut = 0;
+//	int numWorldVertsB1= 0;
+
+	// Plane of shape A: normal in xyz, plane constant in w.
+	b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z);
+	b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal);
+	float planeConstant = planeEq.w;
+	b3Transform convexWorldTransform;
+	convexWorldTransform.setIdentity();
+	convexWorldTransform.setOrigin(posB);
+	convexWorldTransform.setRotation(ornB);
+	b3Transform planeTransform;
+	planeTransform.setIdentity();
+	planeTransform.setOrigin(posA);
+	planeTransform.setRotation(ornA);
+
+	b3Transform planeInConvex;
+	planeInConvex= convexWorldTransform.inverse() * planeTransform;
+	// convexInPlane is computed but not read again in this function.
+	b3Transform convexInPlane;
+	convexInPlane = planeTransform.inverse() * convexWorldTransform;
+	
+	// Negated plane normal expressed in the convex hull's local frame.
+	b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal;
+	float maxDot = -1e30;
+	// hitVertex / hitVtx record the vertex with the largest dot product; they are
+	// written in the loop below but not read afterwards in this function.
+	int hitVertex=-1;
+	b3Vector3 hitVtx;
+
+#define MAX_PLANE_CONVEX_POINTS 64
+
+	b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS];
+	int numPoints = 0;
+
+	// Default selection: the first four contact points.
+	b3Int4 contactIdx;
+	contactIdx.s[0] = 0;
+	contactIdx.s[1] = 1;
+	contactIdx.s[2] = 2;
+	contactIdx.s[3] = 3;
+	
+	for (int i=0;i<hullB->m_numVertices;i++)
+	{
+		b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i];
+		float curDot = vtx.dot(planeNormalInConvex);
+
+
+		if (curDot>maxDot)
+		{
+			hitVertex=i;
+			maxDot=curDot;
+			hitVtx = vtx;
+			//make sure the deepest points is always included
+			// When the buffer is full, free the last slot so this vertex can be stored below.
+			if (numPoints==MAX_PLANE_CONVEX_POINTS)
+				numPoints--;
+		}
+
+		if (numPoints<MAX_PLANE_CONVEX_POINTS)
+		{
+			b3Vector3 vtxWorld = convexWorldTransform*vtx;
+			b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld;
+			// Signed distance of the vertex to the plane, evaluated in the plane's frame.
+			float dist = planeNormal.dot(vtxInPlane)-planeConstant;
+			if (dist<0.f)
+			{
+				// Contact point: world position in xyz, signed distance in w.
+				vtxWorld.w = dist;
+				contactPoints[numPoints] = vtxWorld;
+				numPoints++;
+			}
+		}
+
+	}
+
+	int numReducedPoints  = 0;
+
+	numReducedPoints = numPoints;
+	
+	if (numPoints>4)
+	{
+		// NOTE(review): contactPoints are stored in world space while planeNormalInConvex is
+		// expressed in the convex hull's local frame — confirm this is what the reduction expects.
+		numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx);
+	}
+	int dstIdx;
+//    dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx );
+		
+	if (numReducedPoints>0)
+	{
+		if (nGlobalContactsOut < maxContactCapacity)
+		{
+			dstIdx=nGlobalContactsOut;
+			nGlobalContactsOut++;
+
+			b3Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = -planeNormalWorld;
+			c->setFrictionCoeff(0.7);
+			c->setRestituitionCoeff(0.f);
+
+			c->m_batchIdx = pairIndex;
+			// Bodies with zero inverse mass are stored with a negated index.
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			for (int i=0;i<numReducedPoints;i++)
+			{
+				b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
+				c->m_worldPosB[i] = pOnB1;
+			}
+			// The w component of the normal carries the number of contact points.
+			c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints;
+		}//if (dstIdx < numPairs)
+	}	
+		
+
+
+//	printf("computeContactPlaneConvex\n");
+}
+
+
+
+// Converts a quantized 3-component value back to floating point: each component of
+// vecIn is divided by the matching component of 'quantization' and the result is
+// offset by 'bvhAabbMin'.
+B3_FORCE_INLINE b3Vector3	MyUnQuantize(const unsigned short* vecIn, const b3Vector3& quantization, const b3Vector3& bvhAabbMin)
+	{
+			const b3Scalar scaledX = (b3Scalar)(vecIn[0]) / (quantization.x);
+			const b3Scalar scaledY = (b3Scalar)(vecIn[1]) / (quantization.y);
+			const b3Scalar scaledZ = (b3Scalar)(vecIn[2]) / (quantization.z);
+
+			b3Vector3	unquantized;
+			unquantized.setValue(scaledX, scaledY, scaledZ);
+			unquantized += bvhAabbMin;
+			return unquantized;
+	}
+
+// Empty placeholder: takes no arguments and does nothing.
+void traverseTreeTree()
+{
+
+}
+
+#include "Bullet3Common/shared/b3Mat3x3.h"
+
+// Diagnostic counters updated by findCompoundPairsKernel:
+// numAabbChecks    - node AABB overlap tests counted during a subtree-pair traversal
+// maxNumAabbChecks - largest numAabbChecks value seen during one kernel invocation
+// maxDepth         - largest traversal stack depth seen so far (printed when it grows)
+int numAabbChecks = 0;
+int maxNumAabbChecks = 0;
+int maxDepth = 0;
+
+// work-in-progress
+__kernel void   findCompoundPairsKernel( 
+	int pairIndex,
+	int bodyIndexA,
+	int bodyIndexB,
+	int collidableIndexA,
+	int collidableIndexB,
+	__global const b3RigidBodyData* rigidBodies, 
+	__global const b3Collidable* collidables,
+	__global const b3ConvexPolyhedronData* convexShapes, 
+	__global const b3AlignedObjectArray<b3Float4>& vertices,
+	__global const b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace,
+	__global const b3AlignedObjectArray<b3Aabb>& aabbsLocalSpace,
+	__global const b3GpuChildShape* gpuChildShapes,
+	__global b3Int4* gpuCompoundPairsOut,
+	__global  int* numCompoundPairsOut,
+	int maxNumCompoundPairsCapacity,
+	b3AlignedObjectArray<b3QuantizedBvhNode>&	treeNodesCPU,
+	b3AlignedObjectArray<b3BvhSubtreeInfo>&	subTreesCPU,
+	b3AlignedObjectArray<b3BvhInfo>&	bvhInfoCPU
+	)
+{
+	numAabbChecks=0;
+	maxNumAabbChecks=0;
+//	int i = pairIndex;
+	{
+		
+
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;
+			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;
+			int numSubTreesA = bvhInfoCPU[bvhA].m_numSubTrees;
+			int subTreesOffsetA = bvhInfoCPU[bvhA].m_subTreeOffset;
+			int subTreesOffsetB = bvhInfoCPU[bvhB].m_subTreeOffset;
+
+
+			int numSubTreesB = bvhInfoCPU[bvhB].m_numSubTrees;
+			
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+
+			b3Transform transA;
+			transA.setIdentity();
+			transA.setOrigin(posA);
+			transA.setRotation(ornA);
+
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+			b3Transform transB;
+			transB.setIdentity();
+			transB.setOrigin(posB);
+			transB.setRotation(ornB);
+
+
+
+			for (int p=0;p<numSubTreesA;p++)
+			{
+				b3BvhSubtreeInfo subtreeA = subTreesCPU[subTreesOffsetA+p];
+				//bvhInfoCPU[bvhA].m_quantization
+				b3Vector3 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+				b3Vector3 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+
+				b3Vector3 aabbAMinOut,aabbAMaxOut;
+				float margin=0.f;
+				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+
+				for (int q=0;q<numSubTreesB;q++)
+				{
+					b3BvhSubtreeInfo subtreeB = subTreesCPU[subTreesOffsetB+q];
+
+					b3Vector3 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+					b3Vector3 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+
+					b3Vector3 aabbBMinOut,aabbBMaxOut;
+					float margin=0.f;
+					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+
+					
+					numAabbChecks=0;
+					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+					if (aabbOverlap)
+					{
+						
+						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfoCPU[bvhA].m_nodeOffset;
+		//				int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;
+
+						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfoCPU[bvhB].m_nodeOffset;
+		//				int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;
+
+						b3AlignedObjectArray<b3Int2> nodeStack;
+						b3Int2 node0;
+						node0.x = startNodeIndexA;
+						node0.y = startNodeIndexB;
+
+						int maxStackDepth = 1024;
+						nodeStack.resize(maxStackDepth);
+						int depth=0;
+						nodeStack[depth++]=node0;
+
+						do
+						{
+							if (depth > maxDepth)
+							{
+								maxDepth=depth;
+								printf("maxDepth=%d\n",maxDepth);
+							}
+							b3Int2 node = nodeStack[--depth];
+							
+							b3Vector3 aMinLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+							b3Vector3 aMaxLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin);
+
+							b3Vector3 bMinLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+							b3Vector3 bMaxLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin);
+
+							float margin=0.f;
+							b3Vector3 aabbAMinOut,aabbAMaxOut;
+							b3TransformAabb2(aMinLocal,aMaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+
+							b3Vector3 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2(bMinLocal,bMaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+
+							numAabbChecks++;
+							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (nodeOverlap)
+							{
+								bool isLeafA = treeNodesCPU[node.x].isLeafNode();
+								bool isLeafB = treeNodesCPU[node.y].isLeafNode();
+								bool isInternalA = !isLeafA;
+								bool isInternalB = !isLeafB;
+
+								//fail, even though it might hit two leaf nodes
+								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))
+								{
+									b3Error("Error: traversal exceeded maxStackDepth\n");
+									continue;
+								}
+
+								if(isInternalA)
+								{
+									int nodeAleftChild = node.x+1;
+									bool isNodeALeftChildLeaf = treeNodesCPU[node.x+1].isLeafNode();
+									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + treeNodesCPU[node.x+1].getEscapeIndex();
+
+									if(isInternalB)
+									{					
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode();
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex();
+
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);
+									}
+									else
+									{
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);
+									}
+								}
+								else
+								{
+									if(isInternalB)
+									{
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode();
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex();
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);
+									}
+									else
+									{
+										int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+										if (compoundPairIdx<maxNumCompoundPairsCapacity)
+										{
+											int childShapeIndexA = treeNodesCPU[node.x].getTriangleIndex();
+											int childShapeIndexB = treeNodesCPU[node.y].getTriangleIndex();
+											gpuCompoundPairsOut[compoundPairIdx]  = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+										}
+									}
+								}
+							}
+						} while (depth);
+						maxNumAabbChecks = b3Max(numAabbChecks,maxNumAabbChecks);
+					}
+				}
+			}
+			
+			return;
+		}
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = b3QuatRotate(ornA,childPosA)+posA;
+					b3Quat newOrnA = b3QuatMul(ornA,childOrnA);
+					
+
+					
+					b3Aabb aabbA = aabbsLocalSpace[childColIndexA];
+
+					
+					b3Transform transA;
+					transA.setIdentity();
+					transA.setOrigin(newPosA);
+					transA.setRotation(newOrnA);
+					b3Scalar margin=0.0f;
+
+					b3Vector3 aabbAMinOut,aabbAMaxOut;
+
+					b3TransformAabb2((const b3Float4&)aabbA.m_min,(const b3Float4&)aabbA.m_max, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut);
+
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							b3Quat newOrnB = b3QuatMul(ornB,childOrnB);
+
+							
+
+							b3Aabb aabbB = aabbsLocalSpace[childColIndexB];
+
+							b3Transform transB;
+							transB.setIdentity();
+							transB.setOrigin(newPosB);
+							transB.setRotation(newOrnB);
+
+							b3Vector3 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2((const b3Float4&)aabbB.m_min,(const b3Float4&)aabbB.m_max, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut);
+
+							numAabbChecks++;
+							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (aabbOverlap)
+							{
+								/*
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								b3Quat ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								b3Quat ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+								*/
+								{//
+									int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+						//	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						//	float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							b3Quat ornA = newOrnA;
+							float4 c0;
+							c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1;
+							c1 = transform(&c1local,&posB,&ornB);
+						//	const float4 DeltaC2 = c0 - c1;
+
+							{
+								int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+				}//for (int b=0;b<numChildrenB;b++)	
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) 
+				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = b3QuatRotate(ornB,childPosB)+posB;
+					b3Quat newOrnB = b3QuatMul(ornB,childOrnB);
+
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+
+
+					//////////////////////////////////////
+
+					if (1)
+					{
+					//	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+					//	float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0;
+						c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						b3Quat ornB =newOrnB;
+						float4 c1;
+						c1 = transform(&c1local,&posB,&ornB);
+					//	const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = b3AtomicInc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+}
+
+
+
+/// CPU emulation of the OpenCL kernel that runs the separating-axis tests for
+/// one compound child pair.
+///
+/// Entry `i` of gpuCompoundPairs holds (bodyA, bodyB, childShapeA, childShapeB).
+/// A negative child index means the collidable is taken from the rigid body
+/// itself; otherwise the child's local transform is composed with the body pose.
+///
+/// gpuHasCompoundSepNormalsOut[i] is always cleared first. It is set to 1, and
+/// gpuCompoundSepNormalsOut[i] is written, only when both collidables are
+/// SHAPE_CONVEX_HULL and findSeparatingAxis (A vs B), findSeparatingAxis (B vs A)
+/// and findSeparatingAxisEdgeEdge all return true, evaluated in that order.
+///
+/// `aabbs` is not read by this function. `i` stands in for get_global_id(0).
+__kernel void   processCompoundPairsKernel( __global const b3Int4* gpuCompoundPairs,
+										__global const b3RigidBodyData* rigidBodies, 
+										__global const b3Collidable* collidables,
+										__global const b3ConvexPolyhedronData* convexShapes, 
+										__global const b3AlignedObjectArray<b3Float4>& vertices,
+										__global const b3AlignedObjectArray<b3Float4>& uniqueEdges,
+										__global const b3AlignedObjectArray<b3GpuFace>& faces,
+										__global const b3AlignedObjectArray<int>& indices,
+										__global b3Aabb* aabbs,
+										__global const b3GpuChildShape* gpuChildShapes,
+										__global b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut,
+										__global b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut,
+										int numCompoundPairs,
+										int i
+										)
+{
+	if (i >= numCompoundPairs)
+	{
+		return;
+	}
+
+	const int bodyIndexA = gpuCompoundPairs[i].x;
+	const int bodyIndexB = gpuCompoundPairs[i].y;
+	const int childShapeIndexA = gpuCompoundPairs[i].z;
+	const int childShapeIndexB = gpuCompoundPairs[i].w;
+
+	// Start from the rigid body poses; a child transform is composed on top when present.
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+
+	int collidableIndexA = -1;
+	if (childShapeIndexA < 0)
+	{
+		collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	}
+	else
+	{
+		collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+		float4 localPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+		b3Quat localOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+		// Position first: it must use the body orientation before that is overwritten.
+		posA = b3QuatRotate(ornA,localPosA)+posA;
+		ornA = b3QuatMul(ornA,localOrnA);
+	}
+
+	int collidableIndexB = -1;
+	if (childShapeIndexB < 0)
+	{
+		collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	}
+	else
+	{
+		collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 localPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+		b3Quat localOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+		posB = b3QuatRotate(ornB,localPosB)+posB;
+		ornB = b3QuatMul(ornB,localOrnB);
+	}
+
+	// Default result for this pair: no separating normal recorded.
+	gpuHasCompoundSepNormalsOut[i] = 0;
+
+	const int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+	const int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	if (!((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_CONVEX_HULL)))
+	{
+		return;
+	}
+
+	const int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	const int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	// World-space hull centres; their difference feeds the edge-edge test.
+	posA.w = 0.f;
+	posB.w = 0.f;
+	float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+	float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+	float4 c0 = transform(&c0local, &posA, &ornA);
+	float4 c1 = transform(&c1local,&posB,&ornB);
+	const float4 DeltaC2 = c0 - c1;
+
+	float dmin = FLT_MAX;
+	float4 sepNormal = make_float4(1,0,0,0);
+
+	// Faces of A against B.
+	if (!findSeparatingAxis(	convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal))
+	{
+		return;
+	}
+
+	// Faces of B against A.
+	if (!findSeparatingAxis(	convexShapes[shapeIndexB],convexShapes[shapeIndexA],posB,ornB,posA,ornA,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal))
+	{
+		return;
+	}
+
+	// Edge-edge axes; this is the only call that receives dmin.
+	if (!findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin))
+	{
+		return;
+	}
+
+	gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);
+	gpuHasCompoundSepNormalsOut[i] = 1;
+}
+
+
+/// CPU emulation of the OpenCL kernel that clips the two hulls of compound
+/// pair `i` against each other and appends at most one contact record.
+///
+/// Work is done only when i < numCompoundPairs and gpuHasCompoundSepNormalsOut[i]
+/// is non-zero. Child transforms are composed with the body poses whenever the
+/// child index stored in the pair is >= 0. The points returned by
+/// clipHullAgainstHull are reduced with extractManifoldSequentialGlobal and
+/// stored in globalContactsOut at the index returned by b3AtomicInc.
+///
+/// NOTE(review): b3AtomicInc(nGlobalContactsOut) runs before the capacity
+/// check, so (assuming it increments the counter) the count advances even
+/// when the contact is dropped - callers may need to clamp; confirm.
+__kernel void   clipCompoundsHullHullKernel( __global const b3Int4* gpuCompoundPairs, 
+																					__global const b3RigidBodyData* rigidBodies, 
+																					__global const b3Collidable* collidables,
+																					__global const b3ConvexPolyhedronData* convexShapes, 
+																					__global const b3AlignedObjectArray<b3Float4>& vertices,
+																					__global const b3AlignedObjectArray<b3Float4>& uniqueEdges,
+																					__global const b3AlignedObjectArray<b3GpuFace>& faces,
+																					__global const b3AlignedObjectArray<int>& indices,
+																					__global const b3GpuChildShape* gpuChildShapes,
+																					__global const b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut,
+																					__global const b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut,
+																					__global struct b3Contact4Data* globalContactsOut,
+																					int* nGlobalContactsOut,
+																					int numCompoundPairs, int maxContactCapacity, int i)
+{
+	// `i` stands in for get_global_id(0).
+	const int pairIndex = i;
+
+	// Scratch storage handed to clipHullAgainstHull.
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+
+	float minDist = -1e30f;
+	float maxDist = 0.0f;
+
+	if (pairIndex >= numCompoundPairs)
+	{
+		return;
+	}
+	if (!gpuHasCompoundSepNormalsOut[pairIndex])
+	{
+		return;
+	}
+
+	const int bodyIndexA = gpuCompoundPairs[pairIndex].x;
+	const int bodyIndexB = gpuCompoundPairs[pairIndex].y;
+	const int childShapeIndexA = gpuCompoundPairs[pairIndex].z;
+	const int childShapeIndexB = gpuCompoundPairs[pairIndex].w;
+
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+
+	int collidableIndexA = -1;
+	if (childShapeIndexA < 0)
+	{
+		collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	}
+	else
+	{
+		collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+		float4 localPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+		b3Quat localOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+		// Position first: it must use the body orientation before that is overwritten.
+		posA = b3QuatRotate(ornA,localPosA)+posA;
+		ornA = b3QuatMul(ornA,localOrnA);
+	}
+
+	int collidableIndexB = -1;
+	if (childShapeIndexB < 0)
+	{
+		collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	}
+	else
+	{
+		collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 localPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+		b3Quat localOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+		posB = b3QuatRotate(ornB,localPosB)+posB;
+		ornB = b3QuatMul(ornB,localOrnB);
+	}
+
+	const int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	const int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[pairIndex],
+												convexShapes[shapeIndexA], convexShapes[shapeIndexB],
+												posA,ornA,
+												posB,ornB,
+												worldVertsB1,worldVertsB2,capacityWorldVerts,
+												minDist, maxDist,
+												vertices,faces,indices,
+												vertices,faces,indices,
+												localContactsOut,localContactCapacity);
+	if (numLocalContactsOut <= 0)
+	{
+		return;
+	}
+
+	float4 normal = -gpuCompoundSepNormalsOut[pairIndex];
+	int numCandidates = numLocalContactsOut;
+	float4* candidates = localContactsOut;
+
+	// Identity selection by default; extractManifoldSequentialGlobal receives it by pointer.
+	b3Int4 contactIdx;
+	contactIdx.s[0] = 0;
+	contactIdx.s[1] = 1;
+	contactIdx.s[2] = 2;
+	contactIdx.s[3] = 3;
+
+	int nReducedContacts = extractManifoldSequentialGlobal(candidates, numCandidates, normal, &contactIdx);
+
+	int dstIdx;
+	dstIdx = b3AtomicInc( nGlobalContactsOut);
+	if ((dstIdx+nReducedContacts) >= maxContactCapacity)
+	{
+		return;
+	}
+
+	__global struct b3Contact4Data* contact = globalContactsOut+ dstIdx;
+	contact->m_worldNormalOnB = -normal;
+	contact->m_restituitionCoeffCmp = (0.f*0xffff);
+	contact->m_frictionCoeffCmp = (0.7f*0xffff);
+	contact->m_batchIdx = pairIndex;
+	// A negated body index is stored for bodies whose inverse mass is zero.
+	contact->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+	contact->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+	contact->m_childIndexA = childShapeIndexA;
+	contact->m_childIndexB = childShapeIndexB;
+	for (int p=0;p<nReducedContacts;p++)
+	{
+		contact->m_worldPosB[p] = candidates[contactIdx.s[p]];
+	}
+	b3Contact4Data_setNumPoints(contact,nReducedContacts);
+}
+
+
+/// Host-side driver for compound-vs-compound contact generation (marked
+/// work-in-progress in the original source). It runs three stages in sequence:
+///   1. findCompoundPairsKernel collects candidate child pairs into a local
+///      array with a fixed capacity of 8192 entries;
+///   2. processCompoundPairsKernel is called once per pair and fills the
+///      separating-normal and has-normal arrays;
+///   3. clipCompoundsHullHullKernel is called once per pair and appends
+///      contacts to globalContactsOut, updating nGlobalContactsOut.
+///
+/// Body B's collidable is asserted to be SHAPE_COMPOUND_OF_CONVEX_HULLS.
+/// If stage 1 reports more pairs than the capacity, an error is logged and
+/// the count is clamped to the capacity. The value of the file-level
+/// maxNumAabbChecks is printed on every call.
+void computeContactCompoundCompound(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3RigidBodyData* rigidBodies, 
+																const b3Collidable* collidables,
+																const b3ConvexPolyhedronData* convexShapes,
+																const b3GpuChildShape* cpuChildShapes,
+																const b3AlignedObjectArray<b3Aabb>& hostAabbsWorldSpace,
+																const b3AlignedObjectArray<b3Aabb>& hostAabbsLocalSpace,
+
+																const b3AlignedObjectArray<b3Vector3>& convexVertices,
+																const b3AlignedObjectArray<b3Vector3>& hostUniqueEdges,
+																const b3AlignedObjectArray<int>& convexIndices,
+																const b3AlignedObjectArray<b3GpuFace>& faces,
+																
+																b3Contact4* globalContactsOut,
+																int& nGlobalContactsOut,
+																int maxContactCapacity,
+																b3AlignedObjectArray<b3QuantizedBvhNode>&	treeNodesCPU,
+																b3AlignedObjectArray<b3BvhSubtreeInfo>&	subTreesCPU,
+																b3AlignedObjectArray<b3BvhInfo>&	bvhInfoCPU
+																)
+{
+	b3Assert(collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS);
+
+	// Stage 1: gather candidate child pairs.
+	int maxNumCompoundPairsCapacity = 8192;//1024;
+	int numCompoundPairsOut=0;
+	b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut;
+	cpuCompoundPairsOut.resize(maxNumCompoundPairsCapacity);
+
+	findCompoundPairsKernel( 
+							pairIndex,
+							bodyIndexA,bodyIndexB,
+							collidableIndexA,collidableIndexB,
+							rigidBodies, 
+							collidables,
+							convexShapes, 
+							convexVertices,
+							hostAabbsWorldSpace,
+							hostAabbsLocalSpace,
+							cpuChildShapes,
+							&cpuCompoundPairsOut[0],
+							&numCompoundPairsOut,
+							maxNumCompoundPairsCapacity	,
+							treeNodesCPU,
+							subTreesCPU,
+							bvhInfoCPU
+							);
+
+	printf("maxNumAabbChecks=%d\n",maxNumAabbChecks);
+
+	// The kernel's counter can exceed the capacity; only the stored pairs are processed.
+	if (numCompoundPairsOut>maxNumCompoundPairsCapacity)
+	{
+		b3Error("numCompoundPairsOut exceeded maxNumCompoundPairsCapacity (%d)\n",maxNumCompoundPairsCapacity);
+		numCompoundPairsOut=maxNumCompoundPairsCapacity;
+	}
+
+	b3AlignedObjectArray<b3Float4> cpuCompoundSepNormalsOut;
+	cpuCompoundSepNormalsOut.resize(numCompoundPairsOut);
+	b3AlignedObjectArray<int> cpuHasCompoundSepNormalsOut;
+	cpuHasCompoundSepNormalsOut.resize(numCompoundPairsOut);
+
+	// Stage 2: separating-axis tests, one emulated work-item per pair.
+	// The aabbs argument is passed as 0.
+	for (int pair=0;pair<numCompoundPairsOut;pair++)
+	{
+		processCompoundPairsKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,0,cpuChildShapes,
+			cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,numCompoundPairsOut,pair);
+	}
+
+	// Stage 3: clipping and contact output, again one emulated work-item per pair.
+	for (int pair=0;pair<numCompoundPairsOut;pair++)
+	{
+		clipCompoundsHullHullKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,cpuChildShapes,
+			cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,globalContactsOut,&nGlobalContactsOut,numCompoundPairsOut,maxContactCapacity,pair);
+	}
+}
+
+/// Generates contacts between a plane (body A) and each child hull of a
+/// compound shape (body B). Every child with at least one vertex below the
+/// plane produces one b3Contact4, as long as nGlobalContactsOut is still
+/// below maxContactCapacity.
+///
+/// @param pairIndex            stored in m_batchIdx of each emitted contact
+/// @param bodyIndexA           rigid body that owns the plane
+/// @param bodyIndexB           rigid body that owns the compound shape
+/// @param collidableIndexA     collidable whose m_shapeIndex selects the plane in `faces`
+/// @param collidableIndexB     compound collidable (asserted SHAPE_COMPOUND_OF_CONVEX_HULLS)
+/// @param cpuChildShapes       child shape table indexed from collidables[collidableIndexB].m_shapeIndex
+/// @param convexVertices       vertex pool addressed via m_vertexOffset of each hull
+/// @param convexIndices        not read by this function
+/// @param globalContactsOut    output array of contacts
+/// @param nGlobalContactsOut   number of contacts written so far; incremented per emitted contact
+/// @param maxContactCapacity   capacity of globalContactsOut
+///
+/// At most MAX_PLANE_CONVEX_POINTS penetrating vertices are collected per
+/// child. More than 4 points are reduced through extractManifoldSequentialGlobal.
+/// The point count is stored in m_worldNormalOnB.w and each point's w holds
+/// its signed distance to the plane.
+void computeContactPlaneCompound(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3RigidBodyData* rigidBodies, 
+																const b3Collidable* collidables,
+																const b3ConvexPolyhedronData* convexShapes,
+																const b3GpuChildShape* cpuChildShapes,
+																const b3Vector3* convexVertices,
+																const int* convexIndices,
+																const b3GpuFace* faces,
+																
+																b3Contact4* globalContactsOut,
+																int& nGlobalContactsOut,
+																int maxContactCapacity)
+{
+	b3Assert(collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS);
+
+	const int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+	if (numChildrenB <= 0)
+	{
+		// Nothing to test; also avoids touching body A data that the loop would never have read.
+		return;
+	}
+
+	#define MAX_PLANE_CONVEX_POINTS 64
+
+	// Everything below depends only on the plane body, so it is computed once
+	// instead of once per child (and, for the inverse, once per vertex).
+	b3Vector3 posA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+
+	b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z);
+	b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal);
+	float planeConstant = planeEq.w;
+
+	b3Transform planeTransform;
+	planeTransform.setIdentity();
+	planeTransform.setOrigin(posA);
+	planeTransform.setRotation(ornA);
+	b3Transform worldToPlane = planeTransform.inverse();
+
+	// Root pose of the compound body is likewise shared by all children.
+	float4 rootPosB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat;
+
+	for (int childIdx=0;childIdx<numChildrenB;childIdx++)
+	{
+		int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+childIdx;
+		int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex;
+
+		// World pose of this child = root pose composed with the child's local pose.
+		b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition;
+		b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation;
+		float4  posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB;
+		b3Quaternion ornB = rootOrnB*childOrnB;//b3QuatMul(ornB,childOrnB);
+
+		int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB];
+
+		b3Transform convexWorldTransform;
+		convexWorldTransform.setIdentity();
+		convexWorldTransform.setOrigin(posB);
+		convexWorldTransform.setRotation(ornB);
+
+		b3Transform planeInConvex;
+		planeInConvex= convexWorldTransform.inverse() * planeTransform;
+
+		// Negated plane normal expressed in the child hull's local frame.
+		b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal;
+		float maxDot = -1e30f;
+
+		b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS];
+		int numPoints = 0;
+
+		// Identity selection; only replaced when the manifold is reduced below.
+		b3Int4 contactIdx;
+		contactIdx.s[0] = 0;
+		contactIdx.s[1] = 1;
+		contactIdx.s[2] = 2;
+		contactIdx.s[3] = 3;
+
+		for (int i=0;i<hullB->m_numVertices;i++)
+		{
+			b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i];
+			float curDot = vtx.dot(planeNormalInConvex);
+
+			if (curDot>maxDot)
+			{
+				maxDot=curDot;
+				//make sure the deepest point is always included: free the last
+				//slot when the buffer is already full
+				if (numPoints==MAX_PLANE_CONVEX_POINTS)
+					numPoints--;
+			}
+
+			if (numPoints<MAX_PLANE_CONVEX_POINTS)
+			{
+				b3Vector3 vtxWorld = convexWorldTransform*vtx;
+				b3Vector3 vtxInPlane = worldToPlane*vtxWorld;
+				float dist = planeNormal.dot(vtxInPlane)-planeConstant;
+				if (dist<0.f)
+				{
+					vtxWorld.w = dist;
+					contactPoints[numPoints] = vtxWorld;
+					numPoints++;
+				}
+			}
+		}
+
+		int numReducedPoints = numPoints;
+		if (numPoints>4)
+		{
+			// NOTE(review): the points are in world space while the normal passed
+			// here is in the child hull's local frame - confirm this is intended.
+			numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx);
+		}
+
+		if (numReducedPoints>0)
+		{
+			if (nGlobalContactsOut < maxContactCapacity)
+			{
+				int dstIdx=nGlobalContactsOut;
+				nGlobalContactsOut++;
+
+				b3Contact4* contact = &globalContactsOut[dstIdx];
+				contact->m_worldNormalOnB = -planeNormalWorld;
+				contact->setFrictionCoeff(0.7);
+				contact->setRestituitionCoeff(0.f);
+
+				contact->m_batchIdx = pairIndex;
+				// A negated body index is stored for bodies whose inverse mass is zero.
+				contact->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+				contact->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+				for (int i=0;i<numReducedPoints;i++)
+				{
+					b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
+					contact->m_worldPosB[i] = pOnB1;
+				}
+				// The point count is stored in the w component of the normal.
+				contact->m_worldNormalOnB.w = (b3Scalar)numReducedPoints;
+			}
+		}
+	}
+}
+
+
+
+
+
+/// Generates at most one contact between a sphere (body A) and a convex hull
+/// (body B).
+///
+/// The sphere centre is transformed into body B's local frame and tested
+/// against every face plane of the hull. The loop stops early with "no
+/// collision" as soon as the centre is farther than `radius` from a face
+/// plane, or farther than `radius` from the point that IsPointInPolygon
+/// reports when the centre does not project inside the face. Otherwise the
+/// feature with the largest signed distance is kept.
+///
+/// @param pairIndex            stored in m_batchIdx of the emitted contact
+/// @param bodyIndexA           sphere body
+/// @param bodyIndexB           convex hull body
+/// @param collidableIndexA     collidable that provides the sphere radius
+/// @param collidableIndexB     not read; the hull's collidable is looked up
+///                             through rigidBodies[bodyIndexB].m_collidableIdx
+/// @param globalContactsOut    output array of contacts
+/// @param nGlobalContactsOut   number of contacts written so far; incremented on output
+/// @param maxContactCapacity   capacity of globalContactsOut
+///
+/// The contact point's w component carries the penetration depth
+/// (minDist - radius) and m_worldNormalOnB.w carries the point count (1).
+void	computeContactSphereConvex(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																const b3RigidBodyData* rigidBodies, 
+																const b3Collidable* collidables,
+																const b3ConvexPolyhedronData* convexShapes,
+																const b3Vector3* convexVertices,
+																const int* convexIndices,
+																const b3GpuFace* faces,
+																b3Contact4* globalContactsOut,
+																int& nGlobalContactsOut,
+																int maxContactCapacity)
+{
+
+	float radius = collidables[collidableIndexA].m_radius;
+	float4 spherePos1 = rigidBodies[bodyIndexA].m_pos;
+
+	float4 pos = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion quat = rigidBodies[bodyIndexB].m_quat;
+
+	// World transform of the hull and its inverse; all face tests run in hull-local space.
+	b3Transform tr;
+	tr.setIdentity();
+	tr.setOrigin(pos);
+	tr.setRotation(quat);
+	b3Transform trInv = tr.inverse();
+
+	float4 spherePos = trInv(spherePos1);
+
+	int collidableIndex = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+	int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = b3MakeVector3(0, 0, 0, 0);
+	// Zero-initialized defensively; it is overwritten whenever minDist is updated.
+	float4 localHitNormal = b3MakeVector3(0, 0, 0, 0);
+	float minDist = -1000000.f; // TODO: What is the largest/smallest float?
+	bool bCollide = true;
+	for ( int f = 0; f < numFaces; f++ )
+	{
+		b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];
+		float4 planeEqn;
+		float4 localPlaneNormal = b3MakeVector3(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		float4 n1 = localPlaneNormal;//quatRotate(quat,localPlaneNormal);
+		planeEqn = n1;
+		planeEqn[3] = face.m_plane.w;
+
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);
+
+		if ( dist > radius)
+		{
+			// Centre is farther than the radius from this face plane: no contact.
+			bCollide = false;
+			break;
+		}
+
+		if ( dist > 0 )
+		{
+			//might hit an edge or vertex
+			b3Vector3 out;
+
+			bool isInPoly = IsPointInPolygon(spherePos,
+					&face,
+					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+					convexIndices,
+                    &out);
+			if (isInPoly)
+			{
+				if (dist>minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					localHitNormal = planeEqn;
+				}
+			} else
+			{
+				b3Vector3 tmp = spherePos-out;
+				b3Scalar l2 = tmp.length2();
+				if (l2<radius*radius)
+				{
+					dist  = b3Sqrt(l2);
+					if (dist>minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						// NOTE(review): dist is 0 when the centre coincides with `out`,
+						// which makes this division undefined - confirm it cannot happen.
+						localHitNormal = tmp/dist;
+					}
+					
+				} else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		}
+		else
+		{
+			// Centre is on or behind this face plane.
+			if ( dist > minDist )
+			{
+				minDist = dist;
+				closestPnt = pntReturn;
+				localHitNormal = planeEqn;
+			}
+		}
+	}
+
+	// minDist still near its initial sentinel means no face produced a candidate.
+	if (!(bCollide && minDist > -10000))
+	{
+		return;
+	}
+
+	float4 normalOnSurfaceB1 = tr.getBasis()*localHitNormal;//-hitNormalWorld;
+	float4 pOnB1 = tr(closestPnt);
+	float actualDepth = minDist-radius;
+	if (actualDepth<0)
+	{
+		pOnB1[3] = actualDepth;
+
+		if (nGlobalContactsOut < maxContactCapacity)
+		{
+			int dstIdx=nGlobalContactsOut;
+			nGlobalContactsOut++;
+
+			b3Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = normalOnSurfaceB1;
+			c->setFrictionCoeff(0.7);
+			c->setRestituitionCoeff(0.f);
+
+			c->m_batchIdx = pairIndex;
+			// A negated body index is stored for bodies whose inverse mass is zero.
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_worldPosB[0] = pOnB1;
+			int numPoints = 1;
+			// The point count is stored in the w component of the normal.
+			c->m_worldNormalOnB.w = (b3Scalar)numPoints;
+		}
+	}
+}
+
+
+
+
+/// Host-side convex-hull vs convex-hull contact helper for one body pair.
+///
+/// Reads position and orientation of both bodies, fetches the convex
+/// polyhedron data referenced by each body's collidable, and calls
+/// findSeparatingAxis() on the two hulls. Only when that call returns true
+/// is clipHullHullSingle() invoked with the same shared vertex / edge /
+/// face / index arrays passed for both the A and B sides.
+///
+/// @param pairIndex            not referenced by this function.
+/// @param bodyIndexA           index of body A in @p rigidBodies.
+/// @param bodyIndexB           index of body B in @p rigidBodies.
+/// @param collidableIndexA     index of body A's entry in @p collidables.
+/// @param collidableIndexB     index of body B's entry in @p collidables.
+/// @param rigidBodies          rigid body array (m_pos / m_quat are read here).
+/// @param collidables          collidable array (m_shapeIndex is read here).
+/// @param convexShapes         convex polyhedron array indexed by m_shapeIndex.
+/// @param convexVertices       vertex array forwarded for both hulls.
+/// @param uniqueEdges          edge array forwarded for both hulls.
+/// @param convexIndices        index array forwarded for both hulls.
+/// @param faces                face array forwarded for both hulls.
+/// @param globalContactsOut    contact array handed to clipHullHullSingle().
+/// @param nGlobalContactsOut   contact counter handed to clipHullHullSingle().
+/// @param maxContactCapacity   capacity value handed to clipHullHullSingle().
+/// @param oldContacts          not referenced by this function.
+/// @return -1 when findSeparatingAxis() returns false; otherwise whatever
+///         clipHullHullSingle() returns.
+int computeContactConvexConvex2(
+	int pairIndex,
+	int bodyIndexA, int bodyIndexB,
+	int collidableIndexA, int collidableIndexB,
+	const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies,
+	const b3AlignedObjectArray<b3Collidable>& collidables,
+	const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes,
+	const b3AlignedObjectArray<b3Vector3>& convexVertices,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdges,
+	const b3AlignedObjectArray<int>& convexIndices,
+	const b3AlignedObjectArray<b3GpuFace>& faces,
+	b3AlignedObjectArray<b3Contact4>& globalContactsOut,
+	int& nGlobalContactsOut,
+	int maxContactCapacity,
+	const b3AlignedObjectArray<b3Contact4>& oldContacts
+	)
+{
+	// World transform of each body, copied out of the rigid body array.
+	b3Vector3 positionA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion orientationA = rigidBodies[bodyIndexA].m_quat;
+	b3Vector3 positionB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion orientationB = rigidBodies[bodyIndexB].m_quat;
+
+	// Resolve collidable -> convex polyhedron for both sides (local copies).
+	b3Collidable shapeEntryA = collidables[collidableIndexA];
+	b3ConvexPolyhedronData polyA = convexShapes[shapeEntryA.m_shapeIndex];
+
+	b3Collidable shapeEntryB = collidables[collidableIndexB];
+	b3ConvexPolyhedronData polyB = convexShapes[shapeEntryB.m_shapeIndex];
+
+#ifdef _WIN32
+	// Debug check (Windows builds only): the x component of each body
+	// position must be finite.
+	b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x));
+	b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x));
+#endif
+
+	// Filled in by findSeparatingAxis() and then passed on to the clipper.
+	b3Vector3 separatingAxisWorld;
+
+	if (!findSeparatingAxis(polyA, polyB,
+							positionA, orientationA,
+							positionB, orientationB,
+							convexVertices, uniqueEdges, faces, convexIndices,
+							convexVertices, uniqueEdges, faces, convexIndices,
+							separatingAxisWorld))
+	{
+		// Nothing to clip for this pair.
+		return -1;
+	}
+
+	// Both hulls live in the same shared arrays, so every per-hull argument
+	// is passed twice (once for A, once for B).
+	return clipHullHullSingle(
+		bodyIndexA, bodyIndexB,
+		positionA, orientationA,
+		positionB, orientationB,
+		collidableIndexA, collidableIndexB,
+		&rigidBodies,
+		&globalContactsOut,
+		nGlobalContactsOut,
+		convexShapes,
+		convexShapes,
+		convexVertices,
+		uniqueEdges,
+		faces,
+		convexIndices,
+		convexVertices,
+		uniqueEdges,
+		faces,
+		convexIndices,
+		collidables,
+		collidables,
+		separatingAxisWorld,
+		maxContactCapacity);
+}
+
+
+
+
+
+																
+																
+void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs,
+			const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+			b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
+			const b3OpenCLArray<b3Contact4>* oldContacts,
+			int maxContactCapacity,
+			int compoundPairCapacity,
+			const b3OpenCLArray<b3ConvexPolyhedronData>& convexData,
+			const b3OpenCLArray<b3Vector3>& gpuVertices,
+			const b3OpenCLArray<b3Vector3>& gpuUniqueEdges,
+			const b3OpenCLArray<b3GpuFace>& gpuFaces,
+			const b3OpenCLArray<int>& gpuIndices,
+			const b3OpenCLArray<b3Collidable>& gpuCollidables,
+			const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
+
+			const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
+			const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
+
+            b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
+            b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
+            b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
+            b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
+            b3OpenCLArray<b3Vector3>& worldVertsB2GPU,    
+			b3AlignedObjectArray<class b3OptimizedBvh*>& bvhDataUnused,
+			b3OpenCLArray<b3QuantizedBvhNode>*	treeNodesGPU,
+			b3OpenCLArray<b3BvhSubtreeInfo>*	subTreesGPU,
+			b3OpenCLArray<b3BvhInfo>*	bvhInfo,
+
+			int numObjects,
+			int maxTriConvexPairCapacity,
+			b3OpenCLArray<b3Int4>& triangleConvexPairsOut,
+			int& numTriConvexPairsOut
+			)
+{
+	myframecount++;
+
+	if (!nPairs)
+		return;
+
+#ifdef CHECK_ON_HOST
+
+
+	b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+	treeNodesGPU->copyToHost(treeNodesCPU);
+
+	b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+	subTreesGPU->copyToHost(subTreesCPU);
+
+	b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+	bvhInfo->copyToHost(bvhInfoCPU);
+
+	b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+	clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+
+	b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace;
+	clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace);
+
+	b3AlignedObjectArray<b3Int4> hostPairs;
+	pairs->copyToHost(hostPairs);
+
+	b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+	bodyBuf->copyToHost(hostBodyBuf);
+
+	
+
+	b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+	convexData.copyToHost(hostConvexData);
+
+	b3AlignedObjectArray<b3Vector3> hostVertices;
+	gpuVertices.copyToHost(hostVertices);
+
+	b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+	gpuUniqueEdges.copyToHost(hostUniqueEdges);
+	b3AlignedObjectArray<b3GpuFace> hostFaces;
+	gpuFaces.copyToHost(hostFaces);
+	b3AlignedObjectArray<int> hostIndices;
+	gpuIndices.copyToHost(hostIndices);
+	b3AlignedObjectArray<b3Collidable> hostCollidables;
+	gpuCollidables.copyToHost(hostCollidables);
+	
+	b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+	gpuChildShapes.copyToHost(cpuChildShapes);
+	
+
+	b3AlignedObjectArray<b3Int4> hostTriangleConvexPairs;
+
+	b3AlignedObjectArray<b3Contact4> hostContacts;
+	if (nContacts)
+	{
+		contactOut->copyToHost(hostContacts);
+	}
+
+	b3AlignedObjectArray<b3Contact4> oldHostContacts;
+	
+	if (oldContacts->size())
+	{
+		oldContacts->copyToHost(oldHostContacts);
+	}
+
+
+	hostContacts.resize(maxContactCapacity);
+
+	for (int i=0;i<nPairs;i++)
+	{
+		int bodyIndexA = hostPairs[i].x;
+		int bodyIndexB = hostPairs[i].y;
+		int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			computeContactSphereConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+				&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+		}
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			computeContactSphereConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+				&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+			//printf("convex-sphere\n");
+			
+		}
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			computeContactPlaneConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+			
+		}
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			computeContactPlaneConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+			
+		}
+
+			if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			computeContactCompoundCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], hostAabbsWorldSpace,hostAabbsLocalSpace,hostVertices,hostUniqueEdges,hostIndices,hostFaces,&hostContacts[0],
+			nContacts,maxContactCapacity,treeNodesCPU,subTreesCPU,bvhInfoCPU);	
+//			printf("convex-plane\n");
+			
+		}
+
+
+				if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+			computeContactPlaneCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], &hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("convex-plane\n");
+			
+		}
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			computeContactPlaneCompound(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0],
+			&hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity);
+//			printf("plane-convex\n");
+			
+		}
+
+		if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&
+			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			//printf("hostPairs[i].z=%d\n",hostPairs[i].z);
+			int contactIndex = computeContactConvexConvex2(           i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+			//int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+
+
+			if (contactIndex>=0)
+			{
+//				printf("convex convex contactIndex = %d\n",contactIndex);
+				hostPairs[i].z = contactIndex;
+			}
+//			printf("plane-convex\n");
+			
+		}
+
+
+	}
+
+	if (hostPairs.size())
+	{
+		pairs->copyFromHost(hostPairs);
+	}
+
+	hostContacts.resize(nContacts);
+	if (nContacts)
+		{
+			
+			contactOut->copyFromHost(hostContacts);
+		} else
+	{
+		contactOut->resize(0);
+		}
+
+		m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+		//printf("(HOST) nContacts = %d\n",nContacts);
+
+#else
+
+	{
+		if (nPairs)
+		{
+			m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+
+			B3_PROFILE("primitiveContactsKernel");
+			b3BufferInfoCL bInfo[] = {
+				b3BufferInfoCL( pairs->getBufferCL(), true ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			
+			b3LauncherCL launcher(m_queue, m_primitiveContactsKernel,"m_primitiveContactsKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( nPairs  );
+			launcher.setConst(maxContactCapacity);
+			int num = nPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		
+			nContacts = m_totalContactsOut.at(0);
+			contactOut->resize(nContacts);
+		}
+	}
+
+	
+#endif//CHECK_ON_HOST
+	
+	B3_PROFILE("computeConvexConvexContactsGPUSAT");
+   // printf("nContacts = %d\n",nContacts);
+    
+	
+	m_sepNormals.resize(nPairs);
+	m_hasSeparatingNormals.resize(nPairs);
+	
+	int concaveCapacity=maxTriConvexPairCapacity;
+	m_concaveSepNormals.resize(concaveCapacity);
+	m_concaveHasSeparatingNormals.resize(concaveCapacity);
+	m_numConcavePairsOut.resize(0);
+	m_numConcavePairsOut.push_back(0);
+
+	
+	m_gpuCompoundPairs.resize(compoundPairCapacity);
+
+	m_gpuCompoundSepNormals.resize(compoundPairCapacity);
+	
+	
+	m_gpuHasCompoundSepNormals.resize(compoundPairCapacity);
+	
+	m_numCompoundPairsOut.resize(0);
+	m_numCompoundPairsOut.push_back(0);
+
+	int numCompoundPairs = 0;
+
+	int numConcavePairs =0;
+
+	{
+		clFinish(m_queue);
+		if (findSeparatingAxisOnGpu)
+		{
+			m_dmins.resize(nPairs);
+			if (splitSearchSepAxisConvex)
+			{
+					
+
+				if (useMprGpu)
+				{
+					nContacts = m_totalContactsOut.at(0);
+					{
+						B3_PROFILE("mprPenetrationKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( contactOut->getBufferCL()),
+							b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+						};
+
+						b3LauncherCL launcher(m_queue, m_mprPenetrationKernel,"mprPenetrationKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+
+						launcher.setConst(maxContactCapacity);
+						launcher.setConst( nPairs  );
+
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+						/*
+						b3AlignedObjectArray<int>hostHasSepAxis;
+						m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+						b3AlignedObjectArray<b3Vector3>hostSepAxis;
+						m_sepNormals.copyToHost(hostSepAxis);
+						*/
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+					//	printf("nContacts (after mprPenetrationKernel) = %d\n",nContacts);
+						if (nContacts>maxContactCapacity)
+						{
+                
+							b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+							nContacts = maxContactCapacity;
+						}
+
+					}
+				}
+				
+				if (1)
+				{
+
+					if (1)
+					{
+					{
+						B3_PROFILE("findSeparatingAxisVertexFaceKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+							b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+							b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+							b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( m_dmins.getBufferCL())
+						};
+
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisVertexFaceKernel,"findSeparatingAxisVertexFaceKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst( nPairs  );
+
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+					}
+
+
+					int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+					
+					{
+						B3_PROFILE("findSeparatingAxisEdgeEdgeKernel");
+						b3BufferInfoCL bInfo[] = { 
+							b3BufferInfoCL( pairs->getBufferCL(), true ), 
+							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+							b3BufferInfoCL( convexData.getBufferCL(),true),
+							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+							b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+							b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+							b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+							b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+							b3BufferInfoCL( m_sepNormals.getBufferCL()),
+							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( m_dmins.getBufferCL()),
+							b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true)
+
+						};
+
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel,"findSeparatingAxisEdgeEdgeKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst( numDirections);
+						launcher.setConst( nPairs  );
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+
+					}
+					}
+					if (useMprGpu)
+					{
+						B3_PROFILE("findSeparatingAxisUnitSphereKernel");
+						b3BufferInfoCL bInfo[] = { 
+								b3BufferInfoCL( pairs->getBufferCL(), true ), 
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+								b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+								b3BufferInfoCL( convexData.getBufferCL(),true),
+								b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+								b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true),
+								b3BufferInfoCL( m_sepNormals.getBufferCL()),
+								b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( m_dmins.getBufferCL())
+						};
+
+						b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel,"findSeparatingAxisUnitSphereKernel");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
+						launcher.setConst( numDirections);
+
+						launcher.setConst( nPairs  );
+                                                
+						int num = nPairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+					}
+			}
+				
+
+			} else
+			{
+				B3_PROFILE("findSeparatingAxisKernel");
+				b3BufferInfoCL bInfo[] = { 
+					b3BufferInfoCL( pairs->getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL())
+				};
+
+				b3LauncherCL launcher(m_queue, m_findSeparatingAxisKernel,"m_findSeparatingAxisKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( nPairs  );
+
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			}
+			
+			
+		}
+        else
+        {
+            
+			B3_PROFILE("findSeparatingAxisKernel CPU");
+           
+            
+            b3AlignedObjectArray<b3Int4> hostPairs;
+            pairs->copyToHost(hostPairs);
+            b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+            bodyBuf->copyToHost(hostBodyBuf);
+
+            b3AlignedObjectArray<b3Collidable> hostCollidables;
+            gpuCollidables.copyToHost(hostCollidables);
+            
+            b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+            gpuChildShapes.copyToHost(cpuChildShapes);
+            
+            b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexShapeData;
+            convexData.copyToHost(hostConvexShapeData);
+            
+            b3AlignedObjectArray<b3Vector3> hostVertices;
+            gpuVertices.copyToHost(hostVertices);
+            
+            b3AlignedObjectArray<int> hostHasSepAxis;
+            hostHasSepAxis.resize(nPairs);
+            b3AlignedObjectArray<b3Vector3> hostSepAxis;
+            hostSepAxis.resize(nPairs);
+            
+            b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+            gpuUniqueEdges.copyToHost(hostUniqueEdges);
+            b3AlignedObjectArray<b3GpuFace> hostFaces;
+            gpuFaces.copyToHost(hostFaces);
+            
+            b3AlignedObjectArray<int> hostIndices;
+            gpuIndices.copyToHost(hostIndices);
+			
+			b3AlignedObjectArray<b3Contact4> hostContacts;
+			if (nContacts)
+			{
+				contactOut->copyToHost(hostContacts);
+			}
+			hostContacts.resize(maxContactCapacity);
+			int nGlobalContactsOut = nContacts;
+			
+            
+            for (int i=0;i<nPairs;i++)
+            {
+                
+                int bodyIndexA = hostPairs[i].x;
+                int bodyIndexB = hostPairs[i].y;
+                int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+                int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+                
+                int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex;
+                int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex;
+                
+                hostHasSepAxis[i] = 0;
+                
+                //once the broadphase avoids static-static pairs, we can remove this test
+                if ((hostBodyBuf[bodyIndexA].m_invMass==0) &&(hostBodyBuf[bodyIndexB].m_invMass==0))
+                {
+                    continue;
+                }
+                
+                
+                if ((hostCollidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(hostCollidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+                {
+                    continue;
+                }
+                
+                float dmin = FLT_MAX;
+                
+                b3ConvexPolyhedronData* convexShapeA = &hostConvexShapeData[shapeIndexA];
+                b3ConvexPolyhedronData* convexShapeB = &hostConvexShapeData[shapeIndexB];
+                b3Vector3 posA = hostBodyBuf[bodyIndexA].m_pos;
+                b3Vector3 posB = hostBodyBuf[bodyIndexB].m_pos;
+                b3Quaternion ornA =hostBodyBuf[bodyIndexA].m_quat;
+                b3Quaternion ornB =hostBodyBuf[bodyIndexB].m_quat;
+				
+				
+				if (useGjk)
+				{
+
+					//first approximate the separating axis, to 'fail-proof' GJK+EPA or MPR
+					{
+						b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter;
+						b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA);
+						b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter;
+						b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB);
+						b3Vector3 DeltaC2 = c0 - c1;
+                
+						b3Vector3 sepAxis;
+                
+						bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+							&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+							&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+											 &sepAxis, &dmin);
+                
+						if (hasSepAxisA)
+						{
+							bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2,
+																	&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																	&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																	&sepAxis, &dmin);
+							if (hasSepAxisB)
+							{
+								bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+															 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+															 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+															 &sepAxis, &dmin,false);
+													 
+								if (hasEdgeEdge)
+								{
+									hostHasSepAxis[i] = 1;
+									hostSepAxis[i] = sepAxis;
+									hostSepAxis[i].w = dmin;
+								}
+							}
+						}
+					}
+
+					if (hostHasSepAxis[i])
+					{
+						int pairIndex = i;
+				
+						bool useMpr = true;
+						if (useMpr)
+						{
+							int res=0;
+							float depth = 0.f;
+							b3Vector3 sepAxis2 = b3MakeVector3(1,0,0);
+							b3Vector3 resultPointOnBWorld = b3MakeVector3(0,0,0);
+
+						float depthOut;
+						b3Vector3 dirOut;
+						b3Vector3 posOut;
+						
+
+						//res = b3MprPenetration(bodyIndexA,bodyIndexB,hostBodyBuf,hostConvexShapeData,hostCollidables,hostVertices,&mprConfig,&depthOut,&dirOut,&posOut);
+						res = b3MprPenetration(pairIndex,bodyIndexA,bodyIndexB,&hostBodyBuf[0],&hostConvexShapeData[0],&hostCollidables[0],&hostVertices[0],&hostSepAxis[0],&hostHasSepAxis[0],&depthOut,&dirOut,&posOut);
+						depth = depthOut;
+						sepAxis2 =  b3MakeVector3(-dirOut.x,-dirOut.y,-dirOut.z);
+						resultPointOnBWorld = posOut;
+						//hostHasSepAxis[i] = 0;
+
+
+						if (res==0)
+						{
+							//add point?
+							//printf("depth = %f\n",depth);
+							//printf("normal = %f,%f,%f\n",dir.v[0],dir.v[1],dir.v[2]);
+							//qprintf("pos = %f,%f,%f\n",pos.v[0],pos.v[1],pos.v[2]);
+						
+							
+
+							float dist=0.f;
+
+							const b3ConvexPolyhedronData& hullA = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexA].m_collidableIdx].m_shapeIndex];
+							const b3ConvexPolyhedronData& hullB = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexB].m_collidableIdx].m_shapeIndex];
+
+							if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist))
+							{
+								if (depth > dist)
+								{
+									float diff = depth - dist;
+									
+									static float maxdiff = 0.f;
+									if (maxdiff < diff)
+									{
+										maxdiff = diff;
+										printf("maxdiff = %20.10f\n",maxdiff);
+									}
+								}
+							}
+							if (depth > dmin)
+							{
+								b3Vector3 oldAxis = hostSepAxis[i];
+								depth = dmin;
+								sepAxis2 = oldAxis;
+							}
+
+							
+
+							if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist))
+							{
+								if (depth > dist)
+								{
+									float diff = depth - dist;
+									//printf("?diff  = %f\n",diff );
+									static float maxdiff = 0.f;
+									if (maxdiff < diff)
+									{
+										maxdiff = diff;
+										printf("maxdiff = %20.10f\n",maxdiff);
+									}
+								}
+								//this is used for SAT
+								//hostHasSepAxis[i] = 1;
+								//hostSepAxis[i] = sepAxis2;
+
+								//add contact point
+
+								//int contactIndex = nGlobalContactsOut;
+								b3Contact4& newContact = hostContacts.at(nGlobalContactsOut);
+								nGlobalContactsOut++;
+								newContact.m_batchIdx = 0;//i;
+								newContact.m_bodyAPtrAndSignBit = (hostBodyBuf.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;
+								newContact.m_bodyBPtrAndSignBit = (hostBodyBuf.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+
+								newContact.m_frictionCoeffCmp = 45874;
+								newContact.m_restituitionCoeffCmp = 0;
+						
+								
+								static float maxDepth = 0.f;
+							
+								if (depth > maxDepth)
+								{
+									maxDepth  = depth;
+									printf("MPR maxdepth = %f\n",maxDepth );
+							
+								}
+							
+
+								resultPointOnBWorld.w = -depth;
+								newContact.m_worldPosB[0] = resultPointOnBWorld;
+								//b3Vector3 resultPointOnAWorld = resultPointOnBWorld+depth*sepAxis2;
+								newContact.m_worldNormalOnB = sepAxis2;
+								newContact.m_worldNormalOnB.w = (b3Scalar)1;
+							} else
+							{
+								printf("rejected\n");
+							}
+
+			
+						}
+						} else
+						{
+
+			
+				
+			//int contactIndex = computeContactConvexConvex2(           i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+							b3AlignedObjectArray<b3Contact4> oldHostContacts;	
+							int result;
+							result = computeContactConvexConvex2( //hostPairs,
+													   pairIndex,
+													bodyIndexA, bodyIndexB,
+													   collidableIndexA, collidableIndexB,
+													   hostBodyBuf,
+													   hostCollidables,
+													   hostConvexShapeData,
+													   hostVertices,
+													   hostUniqueEdges,
+													   hostIndices,
+													   hostFaces,
+													   hostContacts,
+													   nGlobalContactsOut,
+														maxContactCapacity,
+														oldHostContacts
+														//hostHasSepAxis,
+														//hostSepAxis
+														
+																);
+						}//mpr
+					}//hostHasSepAxis[i] = 1;
+					
+				} else
+				{
+				
+					b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter;
+					b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA);
+					b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter;
+					b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB);
+					b3Vector3 DeltaC2 = c0 - c1;
+                
+					b3Vector3 sepAxis;
+                
+					bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+						&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+						&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+										 &sepAxis, &dmin);
+                
+					if (hasSepAxisA)
+					{
+						bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2,
+																&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																&hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+																&sepAxis, &dmin);
+						if (hasSepAxisB)
+						{
+							bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2,
+														 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+														 &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0),
+														 &sepAxis, &dmin,true);
+													 
+							if (hasEdgeEdge)
+							{
+								hostHasSepAxis[i] = 1;
+								hostSepAxis[i] = sepAxis;
+							}
+						}
+					}
+				}
+            }
+            
+			if (useGjkContacts)//nGlobalContactsOut>0)
+			{
+				//printf("nGlobalContactsOut=%d\n",nGlobalContactsOut);
+				nContacts = nGlobalContactsOut;
+				contactOut->copyFromHost(hostContacts);
+	
+				m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+			}
+            
+            m_hasSeparatingNormals.copyFromHost(hostHasSepAxis);
+            m_sepNormals.copyFromHost(hostSepAxis);
+        
+            /*
+             //double-check results from GPU (comment-out the 'else' so both paths are executed
+            b3AlignedObjectArray<int> checkHasSepAxis;
+            m_hasSeparatingNormals.copyToHost(checkHasSepAxis);
+            static int frameCount = 0;
+            frameCount++;
+            for (int i=0;i<nPairs;i++)
+            {
+                if (hostHasSepAxis[i] != checkHasSepAxis[i])
+                {
+                    printf("at frameCount %d hostHasSepAxis[%d] = %d but checkHasSepAxis[i] = %d\n",
+                           frameCount,i,hostHasSepAxis[i],checkHasSepAxis[i]);
+                }
+            }
+            //m_hasSeparatingNormals.copyFromHost(hostHasSepAxis);
+            //    m_sepNormals.copyFromHost(hostSepAxis);
+            */
+        }
+        
+        
+        numCompoundPairs = m_numCompoundPairsOut.at(0);
+        bool useGpuFindCompoundPairs=true;
+        if (useGpuFindCompoundPairs)
+        {
+            B3_PROFILE("findCompoundPairsKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( pairs->getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsLocalSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL()),
+                b3BufferInfoCL( m_numCompoundPairsOut.getBufferCL()),
+                b3BufferInfoCL(subTreesGPU->getBufferCL()),
+                b3BufferInfoCL(treeNodesGPU->getBufferCL()),
+                b3BufferInfoCL(bvhInfo->getBufferCL())
+            };
+
+            b3LauncherCL launcher(m_queue, m_findCompoundPairsKernel,"m_findCompoundPairsKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( nPairs  );
+            launcher.setConst( compoundPairCapacity);
+
+            int num = nPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+
+            numCompoundPairs = m_numCompoundPairsOut.at(0);
+            //printf("numCompoundPairs =%d\n",numCompoundPairs );
+            if (numCompoundPairs)
+            {
+                //printf("numCompoundPairs=%d\n",numCompoundPairs);
+            }
+            
+
+        } else
+        {
+
+
+            b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+            treeNodesGPU->copyToHost(treeNodesCPU);
+
+            b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+            subTreesGPU->copyToHost(subTreesCPU);
+
+            b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+            bvhInfo->copyToHost(bvhInfoCPU);
+
+            b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+            clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+
+            b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace;
+            clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace);
+
+            b3AlignedObjectArray<b3Int4> hostPairs;
+            pairs->copyToHost(hostPairs);
+
+            b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+            bodyBuf->copyToHost(hostBodyBuf);
+
+
+            b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut;
+            cpuCompoundPairsOut.resize(compoundPairCapacity);
+
+            b3AlignedObjectArray<b3Collidable> hostCollidables;
+            gpuCollidables.copyToHost(hostCollidables);
+
+            b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+            gpuChildShapes.copyToHost(cpuChildShapes);
+
+            b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+            convexData.copyToHost(hostConvexData);
+
+            b3AlignedObjectArray<b3Vector3> hostVertices;
+            gpuVertices.copyToHost(hostVertices);
+
+
+
+
+            for (int pairIndex=0;pairIndex<nPairs;pairIndex++)
+            {
+                int bodyIndexA = hostPairs[pairIndex].x;
+                int bodyIndexB = hostPairs[pairIndex].y;
+                int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+                int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+				if (cpuChildShapes.size())
+				{
+                findCompoundPairsKernel( 
+                            pairIndex,
+                            bodyIndexA,
+                            bodyIndexB,
+                            collidableIndexA,
+                            collidableIndexB,
+                            &hostBodyBuf[0],
+                            &hostCollidables[0],
+                            &hostConvexData[0],
+                            hostVertices,
+                            hostAabbsWorldSpace,
+                            hostAabbsLocalSpace,
+                            &cpuChildShapes[0],
+                            &cpuCompoundPairsOut[0],
+                            &numCompoundPairs,
+                            compoundPairCapacity,
+                            treeNodesCPU,
+                            subTreesCPU,
+                            bvhInfoCPU
+                            );
+				}
+            }
+            
+
+			m_numCompoundPairsOut.copyFromHostPointer(&numCompoundPairs,1,0,true);
+			if (numCompoundPairs)
+			{
+				b3CompoundOverlappingPair* ptr = (b3CompoundOverlappingPair*)&cpuCompoundPairsOut[0];
+				m_gpuCompoundPairs.copyFromHostPointer(ptr,numCompoundPairs,0,true);
+			}
+			//cpuCompoundPairsOut
+            
+        }
+		if (numCompoundPairs)
+		{
+			printf("numCompoundPairs=%d\n",numCompoundPairs);
+		}
+
+        if (numCompoundPairs > compoundPairCapacity)
+        {
+            b3Error("Exceeded compound pair capacity (%d/%d)\n", numCompoundPairs,  compoundPairCapacity);
+            numCompoundPairs = compoundPairCapacity;
+        }
+
+        
+
+        m_gpuCompoundPairs.resize(numCompoundPairs);
+        m_gpuHasCompoundSepNormals.resize(numCompoundPairs);
+        m_gpuCompoundSepNormals.resize(numCompoundPairs);
+        
+
+        if (numCompoundPairs)
+        {
+            B3_PROFILE("processCompoundPairsPrimitivesKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( contactOut->getBufferCL()),
+                b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+            };
+
+            b3LauncherCL launcher(m_queue, m_processCompoundPairsPrimitivesKernel,"m_processCompoundPairsPrimitivesKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( numCompoundPairs  );
+            launcher.setConst(maxContactCapacity);
+
+            int num = numCompoundPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+            nContacts = m_totalContactsOut.at(0);
+            //printf("nContacts (after processCompoundPairsPrimitivesKernel) = %d\n",nContacts);
+            if (nContacts>maxContactCapacity)
+            {
+                
+                b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+                nContacts = maxContactCapacity;
+            }
+        }
+        
+
+        if (numCompoundPairs)
+        {
+            B3_PROFILE("processCompoundPairsKernel");
+            b3BufferInfoCL bInfo[] = 
+            { 
+                b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+                b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                b3BufferInfoCL( convexData.getBufferCL(),true),
+                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL()),
+                b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL())
+            };
+
+            b3LauncherCL launcher(m_queue, m_processCompoundPairsKernel,"m_processCompoundPairsKernel");
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+            launcher.setConst( numCompoundPairs  );
+
+            int num = numCompoundPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+        
+        }
+
+
+        //printf("numConcave  = %d\n",numConcave);
+
+    
+
+//		printf("hostNormals.size()=%d\n",hostNormals.size());
+		//int numPairs = pairCount.at(0);
+		
+		
+		
+	}
+	int vertexFaceCapacity = 64;
+
+
+		
+	{
+		//now perform the BVH tree query: on the GPU when bvhTraversalKernelGPU is set, otherwise on the host
+			
+					
+				
+				
+		if (treeNodesGPU->size() && treeNodesGPU->size())
+		{
+			if (bvhTraversalKernelGPU)
+			{
+						
+				B3_PROFILE("m_bvhTraversalKernel");
+						
+						
+				numConcavePairs = m_numConcavePairsOut.at(0);
+						
+				b3LauncherCL launcher(m_queue, m_bvhTraversalKernel,"m_bvhTraversalKernel");
+				launcher.setBuffer( pairs->getBufferCL());
+				launcher.setBuffer(  bodyBuf->getBufferCL());
+				launcher.setBuffer( gpuCollidables.getBufferCL());
+				launcher.setBuffer( clAabbsWorldSpace.getBufferCL());
+				launcher.setBuffer( triangleConvexPairsOut.getBufferCL());
+				launcher.setBuffer( m_numConcavePairsOut.getBufferCL());
+				launcher.setBuffer( subTreesGPU->getBufferCL());
+				launcher.setBuffer( treeNodesGPU->getBufferCL());
+				launcher.setBuffer( bvhInfo->getBufferCL());
+						
+				launcher.setConst( nPairs  );
+				launcher.setConst( maxTriConvexPairCapacity);
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+				numConcavePairs = m_numConcavePairsOut.at(0);
+			} else
+			{
+					b3AlignedObjectArray<b3Int4> hostPairs;
+					pairs->copyToHost(hostPairs);
+					b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+					bodyBuf->copyToHost(hostBodyBuf);
+					b3AlignedObjectArray<b3Collidable> hostCollidables;
+					gpuCollidables.copyToHost(hostCollidables);
+					b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+					clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+
+					//int maxTriConvexPairCapacity,
+					b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+					triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity);
+
+					//int numTriConvexPairsOutHost=0;
+					numConcavePairs = 0;
+					//m_numConcavePairsOut
+
+					b3AlignedObjectArray<b3QuantizedBvhNode>	treeNodesCPU;
+					treeNodesGPU->copyToHost(treeNodesCPU);
+					b3AlignedObjectArray<b3BvhSubtreeInfo>	subTreesCPU;
+					subTreesGPU->copyToHost(subTreesCPU);
+					b3AlignedObjectArray<b3BvhInfo>	bvhInfoCPU;
+					bvhInfo->copyToHost(bvhInfoCPU);
+					//compute it...
+
+					volatile int hostNumConcavePairsOut=0;
+
+					//
+					for (int i=0;i<nPairs;i++)
+					{
+						b3BvhTraversal( &hostPairs.at(0), 
+						&hostBodyBuf.at(0),
+						&hostCollidables.at(0),
+						&hostAabbsWorldSpace.at(0),
+						&triangleConvexPairsOutHost.at(0),
+						&hostNumConcavePairsOut,
+						&subTreesCPU.at(0),
+						&treeNodesCPU.at(0),
+						&bvhInfoCPU.at(0),
+						nPairs,
+						maxTriConvexPairCapacity,
+						i);
+					}
+					numConcavePairs = hostNumConcavePairsOut;
+
+					if (hostNumConcavePairsOut)
+					{
+						triangleConvexPairsOutHost.resize(hostNumConcavePairsOut);
+						triangleConvexPairsOut.copyFromHost(triangleConvexPairsOutHost);
+					}
+					//
+
+					m_numConcavePairsOut.resize(0);
+					m_numConcavePairsOut.push_back(numConcavePairs);
+			}
+
+				//printf("numConcavePairs=%d (max = %d\n",numConcavePairs,maxTriConvexPairCapacity);
+						
+			if (numConcavePairs > maxTriConvexPairCapacity)
+			{
+				static int exceeded_maxTriConvexPairCapacity_count = 0;
+				b3Error("Exceeded the maxTriConvexPairCapacity (found %d but max is %d, it happened %d times)\n",
+					numConcavePairs,maxTriConvexPairCapacity,exceeded_maxTriConvexPairCapacity_count++);
+				numConcavePairs = maxTriConvexPairCapacity;
+			}
+			triangleConvexPairsOut.resize(numConcavePairs);
+	
+			if (numConcavePairs)
+			{
+
+				
+						
+	
+				clippingFacesOutGPU.resize(numConcavePairs);
+				worldNormalsAGPU.resize(numConcavePairs);
+				worldVertsA1GPU.resize(vertexFaceCapacity*(numConcavePairs));
+				worldVertsB1GPU.resize(vertexFaceCapacity*(numConcavePairs));
+
+
+				if (findConcaveSeparatingAxisKernelGPU)
+				{
+
+					/*
+					m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU);
+						clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+						worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+						worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+						worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+					*/
+
+					//now perform a SAT test for each triangle-convex element (stored in triangleConvexPairsOut)
+                    if (splitSearchSepAxisConcave)
+                    {
+                        //printf("numConcavePairs = %d\n",numConcavePairs);
+                        m_dmins.resize(numConcavePairs);
+                        {
+                            B3_PROFILE("findConcaveSeparatingAxisVertexFaceKernel");
+                            b3BufferInfoCL bInfo[] = {
+                                b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ),
+                                b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+                                b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+                                b3BufferInfoCL( convexData.getBufferCL(),true),
+                                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                                b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                                b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                                b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                                b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                                b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                                b3BufferInfoCL(worldVertsB1GPU.getBufferCL()),
+                                b3BufferInfoCL(m_dmins.getBufferCL())
+                            };
+                            
+                            b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisVertexFaceKernel,"m_findConcaveSeparatingAxisVertexFaceKernel");
+                            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                            launcher.setConst(vertexFaceCapacity);
+                            launcher.setConst( numConcavePairs  );
+                            
+                            int num = numConcavePairs;
+                            launcher.launch1D( num);
+                            clFinish(m_queue);
+
+                            
+                        }
+//                        numConcavePairs = 0;
+                        if (1)
+                        {
+                            B3_PROFILE("findConcaveSeparatingAxisEdgeEdgeKernel");
+                            b3BufferInfoCL bInfo[] = {
+                                b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ),
+                                b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+                                b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+                                b3BufferInfoCL( convexData.getBufferCL(),true),
+                                b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                                b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                                b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                                b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                                b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                                b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                                b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                                b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                                b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                                b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                                b3BufferInfoCL(worldVertsB1GPU.getBufferCL()),
+                                b3BufferInfoCL(m_dmins.getBufferCL())
+                            };
+                            
+                            b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisEdgeEdgeKernel,"m_findConcaveSeparatingAxisEdgeEdgeKernel");
+                            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                            launcher.setConst(vertexFaceCapacity);
+                            launcher.setConst( numConcavePairs  );
+                            
+                            int num = numConcavePairs;
+                            launcher.launch1D( num);
+                            clFinish(m_queue);
+                        }
+                      
+                        
+                        // numConcavePairs = 0;
+                        
+                        
+                        
+                        
+                        
+                        
+                    } else
+                    {
+                        B3_PROFILE("findConcaveSeparatingAxisKernel");
+                        b3BufferInfoCL bInfo[] = { 
+                            b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), 
+                            b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+                            b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+                            b3BufferInfoCL( convexData.getBufferCL(),true),
+                            b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+                            b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                            b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+                            b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+                            b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+                            b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+                            b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+                            b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+                            b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                            b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                            b3BufferInfoCL(worldNormalsAGPU.getBufferCL()),
+                            b3BufferInfoCL(worldVertsB1GPU.getBufferCL())
+                        };
+
+                        b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisKernel,"m_findConcaveSeparatingAxisKernel");
+                        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+                        launcher.setConst(vertexFaceCapacity);
+                        launcher.setConst( numConcavePairs  );
+
+                        int num = numConcavePairs;
+                        launcher.launch1D( num);
+                        clFinish(m_queue);
+                    }
+                    
+                    
+				} else
+				{
+
+						b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+						b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+						b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+
+						b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+						triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost);
+						//triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+						b3AlignedObjectArray<b3Collidable> hostCollidables;
+						gpuCollidables.copyToHost(hostCollidables);
+						b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace;
+						clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace);
+
+						b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+						convexData.copyToHost(hostConvexData);
+
+						b3AlignedObjectArray<b3Vector3> hostVertices;
+						gpuVertices.copyToHost(hostVertices);
+
+						b3AlignedObjectArray<b3Vector3> hostUniqueEdges;
+						gpuUniqueEdges.copyToHost(hostUniqueEdges);
+						b3AlignedObjectArray<b3GpuFace> hostFaces;
+						gpuFaces.copyToHost(hostFaces);
+						b3AlignedObjectArray<int> hostIndices;
+						gpuIndices.copyToHost(hostIndices);
+						b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes;
+						gpuChildShapes.copyToHost(cpuChildShapes);
+
+
+								
+						b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+						m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+						concaveHasSeparatingNormalsCPU.resize(concaveSepNormalsHost.size());
+
+						b3GpuChildShape* childShapePointerCPU = 0;
+						if (cpuChildShapes.size())
+							childShapePointerCPU  = &cpuChildShapes.at(0);
+
+						clippingFacesOutCPU.resize(clippingFacesOutGPU.size());
+						worldVertsA1CPU.resize(worldVertsA1GPU.size());
+    					worldNormalsACPU.resize(worldNormalsAGPU.size());
+						worldVertsB1CPU.resize(worldVertsB1GPU.size());
+
+						for (int i=0;i<numConcavePairs;i++)
+						{
+							b3FindConcaveSeparatingAxisKernel(&triangleConvexPairsOutHost.at(0),
+								&hostBodyBuf.at(0),
+								&hostCollidables.at(0),
+								&hostConvexData.at(0), &hostVertices.at(0),&hostUniqueEdges.at(0),
+								&hostFaces.at(0),&hostIndices.at(0),childShapePointerCPU,
+								&hostAabbsWorldSpace.at(0),
+								&concaveSepNormalsHost.at(0),
+								&clippingFacesOutCPU.at(0),
+								&worldVertsA1CPU.at(0),
+								&worldNormalsACPU.at(0),
+								&worldVertsB1CPU.at(0),
+								&concaveHasSeparatingNormalsCPU.at(0),
+								vertexFaceCapacity,
+								numConcavePairs,i);
+						};
+
+						m_concaveSepNormals.copyFromHost(concaveSepNormalsHost);
+						m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU);
+						clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+						worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+						worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+						worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+
+
+
+				}
+//							b3AlignedObjectArray<b3Vector3> cpuCompoundSepNormals;
+//						m_concaveSepNormals.copyToHost(cpuCompoundSepNormals);
+//					b3AlignedObjectArray<b3Int4> cpuConcavePairs;
+//				triangleConvexPairsOut.copyToHost(cpuConcavePairs);
+
+
+			}
+		}
+		
+		
+	}
+
+	if (numConcavePairs)
+	{
+			if (numConcavePairs)
+		{
+			B3_PROFILE("findConcaveSphereContactsKernel");
+				nContacts = m_totalContactsOut.at(0);
+//				printf("nContacts1 = %d\n",nContacts);
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+			};
+
+			b3LauncherCL launcher(m_queue, m_findConcaveSphereContactsKernel,"m_findConcaveSphereContactsKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+
+			launcher.setConst( numConcavePairs  );
+			launcher.setConst(maxContactCapacity);
+
+			int num = numConcavePairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			nContacts = m_totalContactsOut.at(0);
+			//printf("nContacts (after findConcaveSphereContactsKernel) = %d\n",nContacts);
+
+			//printf("nContacts2 = %d\n",nContacts);
+
+			if (nContacts >= maxContactCapacity)
+			{
+				b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+				nContacts = maxContactCapacity;
+			}
+		}
+		
+	}
+
+
+
+#ifdef __APPLE__
+	bool contactClippingOnGpu = true;
+#else
+	bool contactClippingOnGpu = true;
+#endif
+
+	if (contactClippingOnGpu)
+	{
+		m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+//		printf("nContacts3 = %d\n",nContacts);
+
+
+		//B3_PROFILE("clipHullHullKernel");
+
+		bool breakupConcaveConvexKernel = true;
+
+#ifdef __APPLE__
+		//actually, some Apple OpenCL platform/device combinations work fine...
+		breakupConcaveConvexKernel = true;
+#endif
+		//concave-convex contact clipping
+		if (numConcavePairs)
+		{
+			//			printf("numConcavePairs = %d\n", numConcavePairs);
+			//		nContacts = m_totalContactsOut.at(0);
+			//	printf("nContacts before = %d\n", nContacts);
+
+			if (breakupConcaveConvexKernel)
+			{
+
+				worldVertsB2GPU.resize(vertexFaceCapacity*numConcavePairs);
+
+
+				//clipFacesAndFindContacts
+
+				if (clipConcaveFacesAndFindContactsCPU)
+				{
+
+					b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+					b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+					b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+					b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+
+					clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+					worldVertsA1GPU.copyToHost(worldVertsA1CPU);
+					worldNormalsAGPU.copyToHost(worldNormalsACPU);
+					worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+
+
+
+					b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+					m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU);
+
+					b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+					m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+
+					b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;  
+					worldVertsB2CPU.resize(worldVertsB2GPU.size());
+
+
+					for (int i=0;i<numConcavePairs;i++)
+					{
+
+						clipFacesAndFindContactsKernel(   &concaveSepNormalsHost.at(0),
+							&concaveHasSeparatingNormalsCPU.at(0),
+							&clippingFacesOutCPU.at(0),
+							&worldVertsA1CPU.at(0),
+							&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							&worldVertsB2CPU.at(0),
+							vertexFaceCapacity,
+							i);
+					}
+
+					clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+					worldVertsB2GPU.copyFromHost(worldVertsB2CPU);
+
+
+				} else
+				{
+
+					if (1)
+					{
+
+
+
+						B3_PROFILE("clipFacesAndFindContacts");
+						//nContacts = m_totalContactsOut.at(0);
+						//int h = m_hasSeparatingNormals.at(0);
+						//int4 p = clippingFacesOutGPU.at(0);
+						b3BufferInfoCL bInfo[] = {
+							b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+							b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+							b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+							b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsB1GPU.getBufferCL()),
+							b3BufferInfoCL( worldVertsB2GPU.getBufferCL())
+						};
+						b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts");
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+						launcher.setConst(vertexFaceCapacity);
+
+						launcher.setConst( numConcavePairs  );
+						int debugMode = 0;
+						launcher.setConst( debugMode);
+						int num = numConcavePairs;
+						launcher.launch1D( num);
+						clFinish(m_queue);
+						//int bla = m_totalContactsOut.at(0);
+					}
+				}
+				//contactReduction
+				{
+					int newContactCapacity=nContacts+numConcavePairs; 
+					contactOut->reserve(newContactCapacity);
+					if (reduceConcaveContactsOnGPU)
+					{
+//						printf("newReservation = %d\n",newReservation);
+						{
+							B3_PROFILE("newContactReductionKernel");
+							b3BufferInfoCL bInfo[] =
+							{
+								b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ),
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+								b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+								b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( contactOut->getBufferCL()),
+								b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+								b3BufferInfoCL( worldVertsB2GPU.getBufferCL()),
+								b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+							};
+
+							b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel");
+							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+							launcher.setConst(vertexFaceCapacity);
+							launcher.setConst(newContactCapacity);
+							launcher.setConst( numConcavePairs  );
+							int num = numConcavePairs;
+
+							launcher.launch1D( num);
+						}
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+					}else
+					{
+						
+						volatile int nGlobalContactsOut = nContacts;
+						b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost;
+						triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+
+						b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU;
+						m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU);
+
+						b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost;
+						m_concaveSepNormals.copyToHost(concaveSepNormalsHost);
+
+
+						b3AlignedObjectArray<b3Contact4> hostContacts;
+						if (nContacts)
+						{
+							contactOut->copyToHost(hostContacts);
+						}
+						hostContacts.resize(newContactCapacity);
+
+						b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+						b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+
+						clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+						worldVertsB2GPU.copyToHost(worldVertsB2CPU);
+
+
+
+						for (int i=0;i<numConcavePairs;i++)
+						{
+							b3NewContactReductionKernel( &triangleConvexPairsOutHost.at(0),
+                                                   &hostBodyBuf.at(0),
+												   &concaveSepNormalsHost.at(0),
+												   &concaveHasSeparatingNormalsCPU.at(0),
+												   &hostContacts.at(0),
+                                                   &clippingFacesOutCPU.at(0),
+                                                   &worldVertsB2CPU.at(0),
+                                                   &nGlobalContactsOut,
+                                                   vertexFaceCapacity,
+												   newContactCapacity,
+                                                   numConcavePairs,
+												   i
+                                                   );
+
+						}
+
+
+						nContacts = nGlobalContactsOut;
+						m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+//						nContacts = m_totalContactsOut.at(0);
+						//contactOut->resize(nContacts);
+						hostContacts.resize(nContacts);
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+						contactOut->copyFromHost(hostContacts);
+					}
+
+				}
+				//re-use?
+
+
+			} else
+			{
+				B3_PROFILE("clipHullHullConcaveConvexKernel");
+				nContacts = m_totalContactsOut.at(0);
+				int newContactCapacity = contactOut->capacity();
+
+				//printf("contactOut5 = %d\n",nContacts);
+				b3BufferInfoCL bInfo[] = { 
+					b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+					b3BufferInfoCL( m_concaveSepNormals.getBufferCL()),
+					b3BufferInfoCL( contactOut->getBufferCL()),
+					b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+				};
+				b3LauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel,"m_clipHullHullConcaveConvexKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(newContactCapacity);
+				launcher.setConst( numConcavePairs  );
+				int num = numConcavePairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+				nContacts = m_totalContactsOut.at(0);
+				contactOut->resize(nContacts);
+				//printf("contactOut6 = %d\n",nContacts);
+				b3AlignedObjectArray<b3Contact4> cpuContacts;
+				contactOut->copyToHost(cpuContacts);
+			}
+			//			printf("nContacts after = %d\n", nContacts);
+		}//numConcavePairs
+
+
+
+		//convex-convex contact clipping
+		
+		bool breakupKernel = false;
+
+#ifdef __APPLE__
+		breakupKernel = true;
+#endif
+
+#ifdef CHECK_ON_HOST
+	bool computeConvexConvex = false;
+#else
+	bool computeConvexConvex = true;
+#endif//CHECK_ON_HOST
+		if (computeConvexConvex)
+		{
+			B3_PROFILE("clipHullHullKernel");
+		if (breakupKernel)
+		{
+
+
+
+
+			worldVertsB1GPU.resize(vertexFaceCapacity*nPairs);
+			clippingFacesOutGPU.resize(nPairs);
+			worldNormalsAGPU.resize(nPairs);
+			worldVertsA1GPU.resize(vertexFaceCapacity*nPairs);
+			worldVertsB2GPU.resize(vertexFaceCapacity*nPairs);
+
+			if (findConvexClippingFacesGPU)
+			{
+				B3_PROFILE("findClippingFacesKernel");
+				b3BufferInfoCL bInfo[] = {
+					b3BufferInfoCL( pairs->getBufferCL(), true ),
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true),
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true), 
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+					b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+					b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+					b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+					b3BufferInfoCL( worldVertsB1GPU.getBufferCL())
+				};
+
+				b3LauncherCL launcher(m_queue, m_findClippingFacesKernel,"m_findClippingFacesKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( vertexFaceCapacity);
+				launcher.setConst( nPairs  );
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+
+			} else
+			{
+				
+				float minDist = -1e30f;
+				float maxDist = 0.02f;
+
+				b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData;
+				convexData.copyToHost(hostConvexData);
+				b3AlignedObjectArray<b3Collidable> hostCollidables;
+				gpuCollidables.copyToHost(hostCollidables);
+
+				b3AlignedObjectArray<int> hostHasSepNormals;
+				m_hasSeparatingNormals.copyToHost(hostHasSepNormals);
+				b3AlignedObjectArray<b3Vector3> cpuSepNormals;
+				m_sepNormals.copyToHost(cpuSepNormals);
+
+				b3AlignedObjectArray<b3Int4> hostPairs;
+				pairs->copyToHost(hostPairs);
+				b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+				bodyBuf->copyToHost(hostBodyBuf);
+
+
+				//worldVertsB1GPU.resize(vertexFaceCapacity*nPairs);
+				b3AlignedObjectArray<b3Vector3> worldVertsB1CPU;
+				worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+
+				b3AlignedObjectArray<b3Int4> clippingFacesOutCPU;
+				clippingFacesOutGPU.copyToHost(clippingFacesOutCPU);
+
+				b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+				worldNormalsACPU.resize(nPairs);
+
+				b3AlignedObjectArray<b3Vector3> worldVertsA1CPU;
+				worldVertsA1CPU.resize(worldVertsA1GPU.size());
+			
+			
+				b3AlignedObjectArray<b3Vector3> hostVertices;
+				gpuVertices.copyToHost(hostVertices);
+				b3AlignedObjectArray<b3GpuFace> hostFaces;
+				gpuFaces.copyToHost(hostFaces);
+				b3AlignedObjectArray<int> hostIndices;
+				gpuIndices.copyToHost(hostIndices);
+				
+
+				for (int i=0;i<nPairs;i++)
+				{
+
+					int bodyIndexA = hostPairs[i].x;
+					int bodyIndexB = hostPairs[i].y;
+			
+					int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx;
+					int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx;
+			
+					int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex;
+					int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex;
+			
+
+					if (hostHasSepNormals[i])
+					{
+						b3FindClippingFaces(cpuSepNormals[i],
+							&hostConvexData[shapeIndexA],
+							&hostConvexData[shapeIndexB],
+							hostBodyBuf[bodyIndexA].m_pos,hostBodyBuf[bodyIndexA].m_quat,
+							hostBodyBuf[bodyIndexB].m_pos,hostBodyBuf[bodyIndexB].m_quat,
+							&worldVertsA1CPU.at(0),&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							vertexFaceCapacity,minDist,maxDist,
+							&hostVertices.at(0),&hostFaces.at(0),
+							&hostIndices.at(0),
+							&hostVertices.at(0),&hostFaces.at(0),
+							&hostIndices.at(0),&clippingFacesOutCPU.at(0),i);
+					}
+				}
+
+				clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU);
+				worldVertsA1GPU.copyFromHost(worldVertsA1CPU);
+				worldNormalsAGPU.copyFromHost(worldNormalsACPU);
+				worldVertsB1GPU.copyFromHost(worldVertsB1CPU);
+
+			}
+
+
+
+
+
+			///clip face B against face A, reduce contacts and append them to a global contact array
+			if (1)
+			{
+				if (clipConvexFacesAndFindContactsCPU)
+				{
+
+					//b3AlignedObjectArray<b3Int4> hostPairs;
+					//pairs->copyToHost(hostPairs);
+
+					b3AlignedObjectArray<b3Vector3> hostSepNormals;
+					m_sepNormals.copyToHost(hostSepNormals);
+					b3AlignedObjectArray<int> hostHasSepAxis;
+					m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+
+					b3AlignedObjectArray<b3Int4> hostClippingFaces;
+					clippingFacesOutGPU.copyToHost(hostClippingFaces);
+					b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+					worldVertsB2CPU.resize(vertexFaceCapacity*nPairs);
+					
+					b3AlignedObjectArray<b3Vector3>worldVertsA1CPU;
+					worldVertsA1GPU.copyToHost(worldVertsA1CPU);
+					b3AlignedObjectArray<b3Vector3> worldNormalsACPU;
+					worldNormalsAGPU.copyToHost(worldNormalsACPU);
+
+					b3AlignedObjectArray<b3Vector3>  worldVertsB1CPU;
+					worldVertsB1GPU.copyToHost(worldVertsB1CPU);
+
+					/*
+					  __global const b3Float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global b3Int4* clippingFacesOut,
+                                                   __global b3Float4* worldVertsA1,
+                                                   __global b3Float4* worldNormalsA1,
+                                                   __global b3Float4* worldVertsB1,
+                                                   __global b3Float4* worldVertsB2,
+                                                    int vertexFaceCapacity,
+															int pairIndex
+					*/
+					for (int i=0;i<nPairs;i++)
+					{
+						clipFacesAndFindContactsKernel(
+							&hostSepNormals.at(0),
+							&hostHasSepAxis.at(0),
+							&hostClippingFaces.at(0),
+							&worldVertsA1CPU.at(0),
+							&worldNormalsACPU.at(0),
+							&worldVertsB1CPU.at(0),
+							&worldVertsB2CPU.at(0),
+
+						vertexFaceCapacity,
+							i);
+					}
+					
+					clippingFacesOutGPU.copyFromHost(hostClippingFaces);
+					worldVertsB2GPU.copyFromHost(worldVertsB2CPU);
+
+				} else
+				{
+					B3_PROFILE("clipFacesAndFindContacts");
+					//nContacts = m_totalContactsOut.at(0);
+					//int h = m_hasSeparatingNormals.at(0);
+					//int4 p = clippingFacesOutGPU.at(0);
+					b3BufferInfoCL bInfo[] = {
+						b3BufferInfoCL( m_sepNormals.getBufferCL()),
+						b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+						b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsA1GPU.getBufferCL()),
+						b3BufferInfoCL( worldNormalsAGPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsB1GPU.getBufferCL()),
+						b3BufferInfoCL( worldVertsB2GPU.getBufferCL())
+					};
+
+					b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts");
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setConst(vertexFaceCapacity);
+
+					launcher.setConst( nPairs  );
+					int debugMode = 0;
+					launcher.setConst( debugMode);
+					int num = nPairs;
+					launcher.launch1D( num);
+					clFinish(m_queue);
+				} 
+
+				{
+					nContacts = m_totalContactsOut.at(0);
+					//printf("nContacts = %d\n",nContacts);
+
+					int newContactCapacity = nContacts+nPairs;
+					contactOut->reserve(newContactCapacity);
+
+					if (reduceConvexContactsOnGPU)
+					{
+						{
+							B3_PROFILE("newContactReductionKernel");
+							b3BufferInfoCL bInfo[] =
+							{
+								b3BufferInfoCL( pairs->getBufferCL(), true ),
+								b3BufferInfoCL( bodyBuf->getBufferCL(),true),
+								b3BufferInfoCL( m_sepNormals.getBufferCL()),
+								b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+								b3BufferInfoCL( contactOut->getBufferCL()),
+								b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+								b3BufferInfoCL( worldVertsB2GPU.getBufferCL()),
+								b3BufferInfoCL( m_totalContactsOut.getBufferCL())
+							};
+
+							b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel");
+							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+							launcher.setConst(vertexFaceCapacity);
+							launcher.setConst(newContactCapacity);
+							launcher.setConst( nPairs  );
+							int num = nPairs;
+
+							launcher.launch1D( num);
+						}
+						nContacts = m_totalContactsOut.at(0);
+						contactOut->resize(nContacts);
+					} else
+					{
+
+						volatile int nGlobalContactsOut = nContacts;
+						b3AlignedObjectArray<b3Int4> hostPairs;
+						pairs->copyToHost(hostPairs);
+						b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf;
+						bodyBuf->copyToHost(hostBodyBuf);
+						b3AlignedObjectArray<b3Vector3> hostSepNormals;
+						m_sepNormals.copyToHost(hostSepNormals);
+						b3AlignedObjectArray<int> hostHasSepAxis;
+						m_hasSeparatingNormals.copyToHost(hostHasSepAxis);
+						b3AlignedObjectArray<b3Contact4> hostContactsOut;
+						contactOut->copyToHost(hostContactsOut);
+						hostContactsOut.resize(newContactCapacity);
+
+						b3AlignedObjectArray<b3Int4> hostClippingFaces;
+						clippingFacesOutGPU.copyToHost(hostClippingFaces);
+						b3AlignedObjectArray<b3Vector3> worldVertsB2CPU;
+						worldVertsB2GPU.copyToHost(worldVertsB2CPU);
+
+						for (int i=0;i<nPairs;i++)
+						{
+							b3NewContactReductionKernel(&hostPairs.at(0),
+								&hostBodyBuf.at(0),
+								&hostSepNormals.at(0),
+								&hostHasSepAxis.at(0),
+								&hostContactsOut.at(0),
+								&hostClippingFaces.at(0),
+								&worldVertsB2CPU.at(0),
+								&nGlobalContactsOut,
+								vertexFaceCapacity,
+								newContactCapacity,
+								nPairs,
+								i);
+						}
+
+						nContacts = nGlobalContactsOut;
+						m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+						hostContactsOut.resize(nContacts);
+						//printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts);
+						contactOut->copyFromHost(hostContactsOut);
+					}
+					//                    b3Contact4 pt = contactOut->at(0);
+					//                  printf("nContacts = %d\n",nContacts);
+				}
+			}
+		}            
+		else//breakupKernel
+		{
+
+			if (nPairs)
+			{
+				b3BufferInfoCL bInfo[] = {
+					b3BufferInfoCL( pairs->getBufferCL(), true ), 
+					b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+					b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					b3BufferInfoCL( convexData.getBufferCL(),true),
+					b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+					b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+					b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+					b3BufferInfoCL( m_sepNormals.getBufferCL()),
+					b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
+					b3BufferInfoCL( contactOut->getBufferCL()),
+					b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+				};
+				b3LauncherCL launcher(m_queue, m_clipHullHullKernel,"m_clipHullHullKernel");
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( nPairs  );
+				launcher.setConst(maxContactCapacity);
+
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+
+				nContacts = m_totalContactsOut.at(0);
+				if (nContacts >= maxContactCapacity)
+				{
+					b3Error("Exceeded contact capacity (%d/%d)\n",nContacts,maxContactCapacity);
+					nContacts = maxContactCapacity;
+				}
+				contactOut->resize(nContacts);
+			}
+		}
+
+
+		int nCompoundsPairs = m_gpuCompoundPairs.size();
+
+		if (nCompoundsPairs)
+		{
+			b3BufferInfoCL bInfo[] = {
+				b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), 
+				b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
+				b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				b3BufferInfoCL( convexData.getBufferCL(),true),
+				b3BufferInfoCL( gpuVertices.getBufferCL(),true),
+				b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				b3BufferInfoCL( gpuFaces.getBufferCL(),true),
+				b3BufferInfoCL( gpuIndices.getBufferCL(),true),
+				b3BufferInfoCL( gpuChildShapes.getBufferCL(),true),
+				b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL(),true),
+				b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL(),true),
+				b3BufferInfoCL( contactOut->getBufferCL()),
+				b3BufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			b3LauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel,"m_clipCompoundsHullHullKernel");
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( nCompoundsPairs  );
+			launcher.setConst(maxContactCapacity);
+
+			int num = nCompoundsPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+
+			nContacts = m_totalContactsOut.at(0);
+			if (nContacts>maxContactCapacity)
+			{
+
+				b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity);
+				nContacts = maxContactCapacity;
+			}
+			contactOut->resize(nContacts);
+		}//if nCompoundsPairs
+		}
+	}//contactClippingOnGpu
+
+	//printf("nContacts end = %d\n",nContacts);
+	
+	//printf("frameCount = %d\n",frameCount++);
+}
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
new file mode 100644
index 0000000000..e24c1579c6
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
@@ -0,0 +1,118 @@
+
+#ifndef _CONVEX_HULL_CONTACT_H
+#define _CONVEX_HULL_CONTACT_H
+
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "b3OptimizedBvh.h"
+#include "b3BvhInfo.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+
+//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
+
+
+
+
+struct GpuSatCollision // OpenCL handles, kernels and device buffers used by computeConvexConvexContactsGPUSAT()
+{
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;	// queue handed to b3LauncherCL / clFinish when the kernels below are run
+	cl_kernel				m_findSeparatingAxisKernel;
+	cl_kernel				m_mprPenetrationKernel;
+	cl_kernel				m_findSeparatingAxisUnitSphereKernel;
+
+	// NOTE(review): presumably the separating-axis search split by feature type -- confirm in the .cpp.
+	cl_kernel m_findSeparatingAxisVertexFaceKernel;
+	cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
+	
+	cl_kernel				m_findConcaveSeparatingAxisKernel;
+    cl_kernel				m_findConcaveSeparatingAxisVertexFaceKernel;
+    cl_kernel				m_findConcaveSeparatingAxisEdgeEdgeKernel;
+ 
+    
+    
+    
+	cl_kernel				m_findCompoundPairsKernel;
+	cl_kernel				m_processCompoundPairsKernel;
+	// Single-kernel contact clipping: m_clipHullHullKernel runs over the convex pairs, the compound variant over m_gpuCompoundPairs.
+	cl_kernel				m_clipHullHullKernel;
+	cl_kernel				m_clipCompoundsHullHullKernel;
+    // Split ("breakupKernel") convex-convex path: find clipping faces, clip them, then reduce with m_newContactReductionKernel.
+    cl_kernel               m_clipFacesAndFindContacts;
+    cl_kernel               m_findClippingFacesKernel;
+    // Clipping kernel launched over the triangle-vs-convex pairs (numConcavePairs work items).
+	cl_kernel				m_clipHullHullConcaveConvexKernel;
+//	cl_kernel				m_extractManifoldAndAddContactKernel;
+    cl_kernel               m_newContactReductionKernel;
+
+	cl_kernel				m_bvhTraversalKernel;
+	cl_kernel				m_primitiveContactsKernel;
+	cl_kernel				m_findConcaveSphereContactsKernel;
+
+	cl_kernel				m_processCompoundPairsPrimitivesKernel;
+    
+	b3OpenCLArray<b3Vector3> m_unitSphereDirections;
+	// Element 0 is the running contact count; the host reads it back with at(0) after each clipping kernel.
+	b3OpenCLArray<int>		m_totalContactsOut;
+	// m_sepNormals / m_hasSeparatingNormals are indexed by pair (see the host fallback loops in the .cpp).
+	b3OpenCLArray<b3Vector3> m_sepNormals;
+	b3OpenCLArray<float> m_dmins;
+
+	b3OpenCLArray<int>		m_hasSeparatingNormals;
+	b3OpenCLArray<b3Vector3> m_concaveSepNormals;
+	b3OpenCLArray<int>		m_concaveHasSeparatingNormals;
+	b3OpenCLArray<int>		m_numConcavePairsOut;
+	b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;	// its size() decides whether the compound clipping kernel runs
+	b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
+	b3OpenCLArray<int>		m_gpuHasCompoundSepNormals;
+	b3OpenCLArray<int>		m_numCompoundPairsOut;
+	
+	// NOTE(review): ctor/dtor are defined out of line; presumably they create and release the kernels/buffers above -- confirm.
+	GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~GpuSatCollision();
+	// Narrow-phase entry point. The clipping stage of its definition appends contacts to contactOut and refreshes nContacts
+	// from m_totalContactsOut; the single-kernel and compound paths clamp nContacts to maxContactCapacity.
+	void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, 
+			const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
+			b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
+			const b3OpenCLArray<b3Contact4>* oldContacts,
+			int maxContactCapacity,
+			int compoundPairCapacity,
+			const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData, // NOTE(review): a b3OpenCLArray despite the "host" prefix
+			const b3OpenCLArray<b3Vector3>& vertices,
+			const b3OpenCLArray<b3Vector3>& uniqueEdges,
+			const b3OpenCLArray<b3GpuFace>& faces,
+			const b3OpenCLArray<int>& indices,
+			const b3OpenCLArray<b3Collidable>& gpuCollidables,
+			const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
+
+			const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
+			const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
+			// Scratch buffers passed in by reference; the breakupKernel path resizes them to nPairs or vertexFaceCapacity*nPairs entries.
+           b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
+           b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
+           b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
+           b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
+           b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
+		   b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
+		   b3OpenCLArray<b3QuantizedBvhNode>*	treeNodesGPU,
+			b3OpenCLArray<b3BvhSubtreeInfo>*	subTreesGPU,
+			b3OpenCLArray<b3BvhInfo>*	bvhInfo,
+			int numObjects,
+			int maxTriConvexPairCapacity,
+			b3OpenCLArray<b3Int4>& triangleConvexPairs,
+			int& numTriConvexPairsOut
+			);
+
+
+};
+
+#endif //_CONVEX_HULL_CONTACT_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
new file mode 100644
index 0000000000..337100fb1a
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
@@ -0,0 +1,9 @@
+#ifndef CONVEX_POLYHEDRON_CL
+#define CONVEX_POLYHEDRON_CL
+
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+
+
+
+#endif //CONVEX_POLYHEDRON_CL
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp
new file mode 100644
index 0000000000..d636f983c6
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp
@@ -0,0 +1,1014 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2008 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software in a
+product, an acknowledgment in the product documentation would be appreciated
+but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+/*
+GJK-EPA collision solver by Nathanael Presson, 2008
+*/
+
+#include "b3GjkEpa.h"
+
+#include "b3SupportMappings.h"
+
+namespace gjkepa2_impl2
+{
+
+	// Config
+
+	/* GJK tuning constants, all consumed by b3GJK below	*/ 
+#define GJK_MAX_ITERATIONS	128	/* b3GJK::Evaluate reports Failed once this many iterations have run */
+#define GJK_ACCURACY		((b3Scalar)0.0001)	/* relative tolerance in Evaluate's termination test: (rl-alpha) <= GJK_ACCURACY*rl */
+#define GJK_MIN_DISTANCE	((b3Scalar)0.0001)	/* ray length below which Evaluate reports Inside */
+#define GJK_DUPLICATED_EPS	((b3Scalar)0.0001)	/* compared against the SQUARED distance between a new support point and each of the last four */
+#define GJK_SIMPLEX2_EPS	((b3Scalar)0.0)	/* segment projectorigin: minimum squared edge length, else -1 is returned */
+#define GJK_SIMPLEX3_EPS	((b3Scalar)0.0)	/* triangle projectorigin: minimum squared normal length, else -1 is returned */
+#define GJK_SIMPLEX4_EPS	((b3Scalar)0.0)	/* tetrahedron projectorigin: minimum |det|, else -1 is returned */
+
+	/* EPA	*/ 
+#define EPA_MAX_VERTICES	64
+#define EPA_MAX_FACES		(EPA_MAX_VERTICES*2)
+#define EPA_MAX_ITERATIONS	255
+#define EPA_ACCURACY		((b3Scalar)0.0001)
+#define EPA_FALLBACK		(10*EPA_ACCURACY)
+#define EPA_PLANE_EPS		((b3Scalar)0.00001)
+#define EPA_INSIDE_EPS		((b3Scalar)0.01)
+
+
+	// Shorthands
+	
+
+	// MinkowskiDiff: support-mapping view of (shape 0) - (shape 1), built from two convex polyhedra.
+	struct	b3MinkowskiDiff
+	{
+		// Shape 0 is sampled by Support0, shape 1 by Support1.
+		const b3ConvexPolyhedronData*	m_shapes[2];
+		// Support1 applies m_toshape1 to the query direction and m_toshape0 to the resulting point.
+		b3Matrix3x3				m_toshape1;
+		b3Transform				m_toshape0;
+		// Picks the *WithMargin helpers (called with a margin of 0.f). Not initialised here: call EnableMargin() first.
+		bool					m_enableMargin;
+		void					EnableMargin(bool enable)
+		{
+			m_enableMargin = enable;
+		}
+		// Support point of shape 0 along d. verticesA is forwarded to the helper together with m_shapes[0].
+		inline b3Vector3		Support0(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA) const
+		{
+			if (m_enableMargin)
+			{
+				return localGetSupportVertexWithMargin(d,m_shapes[0],verticesA,0.f);
+			} else
+			{
+				return localGetSupportVertexWithoutMargin(d,m_shapes[0],verticesA);
+			}
+		}
+		// Support point of shape 1 along d. verticesB is forwarded to the helper together with m_shapes[1].
+		inline b3Vector3		Support1(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			if (m_enableMargin)
+			{
+				return m_toshape0*(localGetSupportVertexWithMargin(m_toshape1*d,m_shapes[1],verticesB,0.f));
+			} else
+			{
+				return m_toshape0*(localGetSupportVertexWithoutMargin(m_toshape1*d,m_shapes[1],verticesB));
+			}
+		}
+		// Support point of the difference: Support0(d) - Support1(-d).
+		inline b3Vector3		Support(const b3Vector3& d,  const b3AlignedObjectArray<b3Vector3>& verticesA,  const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			return(Support0(d,verticesA)-Support1(-d,verticesB));
+		}
+		// Support point of a single shape: index 0 selects shape 0, any other value selects shape 1.
+		// Each shape is paired with its own vertex array (the arrays used to be swapped between the two branches).
+		b3Vector3				Support(const b3Vector3& d,unsigned int index,const b3AlignedObjectArray<b3Vector3>& verticesA,  const b3AlignedObjectArray<b3Vector3>& verticesB) const
+		{
+			if(index)
+				return(Support1(d,verticesB));
+			else
+				return(Support0(d,verticesA));
+		}
+	};
+
+	typedef	b3MinkowskiDiff	tShape;	/* shorthand used by b3GJK for its m_shape field and the Evaluate() argument */
+
+
+	// GJK
+	struct	b3GJK
+	{
+		/* Types		*/ 
+		struct	sSV
+		{
+			b3Vector3	d,w;
+		};
+		struct	sSimplex
+		{
+			sSV*		c[4];
+			b3Scalar	p[4];
+			unsigned int			rank;
+		};
+		struct	eStatus	{ enum _ {
+			Valid,
+			Inside,
+			Failed		};};
+			/* Fields		*/ 
+			tShape			m_shape;
+			const b3AlignedObjectArray<b3Vector3>& m_verticesA;
+			const b3AlignedObjectArray<b3Vector3>& m_verticesB;
+			b3Vector3		m_ray;
+			b3Scalar		m_distance;
+			sSimplex		m_simplices[2];
+			sSV				m_store[4];
+			sSV*			m_free[4];
+			unsigned int				m_nfree;
+			unsigned int				m_current;
+			sSimplex*		m_simplex;
+			eStatus::_		m_status;
+			/* Methods		*/ 
+			b3GJK(const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB)
+				:m_verticesA(verticesA),m_verticesB(verticesB)
+			{
+				Initialize();
+			}
+			void				Initialize()
+			{
+				m_ray		=	b3MakeVector3(0,0,0);
+				m_nfree		=	0;
+				m_status	=	eStatus::Failed;
+				m_current	=	0;
+				m_distance	=	0;
+			}
+			eStatus::_			Evaluate(const tShape& shapearg,const b3Vector3& guess)
+			{
+				unsigned int			iterations=0;
+				b3Scalar	sqdist=0;
+				b3Scalar	alpha=0;
+				b3Vector3	lastw[4];
+				unsigned int			clastw=0;
+				/* Initialize solver		*/ 
+				m_free[0]			=	&m_store[0];
+				m_free[1]			=	&m_store[1];
+				m_free[2]			=	&m_store[2];
+				m_free[3]			=	&m_store[3];
+				m_nfree				=	4;
+				m_current			=	0;
+				m_status			=	eStatus::Valid;
+				m_shape				=	shapearg;
+				m_distance			=	0;
+				/* Initialize simplex		*/ 
+				m_simplices[0].rank	=	0;
+				m_ray				=	guess;
+				const b3Scalar	sqrl=	m_ray.length2();
+				appendvertice(m_simplices[0],sqrl>0?-m_ray:b3MakeVector3(1,0,0));
+				m_simplices[0].p[0]	=	1;
+				m_ray				=	m_simplices[0].c[0]->w;	
+				sqdist				=	sqrl;
+				lastw[0]			=
+					lastw[1]			=
+					lastw[2]			=
+					lastw[3]			=	m_ray;
+				/* Loop						*/ 
+				do	{
+					const unsigned int		next=1-m_current;
+					sSimplex&	cs=m_simplices[m_current];
+					sSimplex&	ns=m_simplices[next];
+					/* Check zero							*/ 
+					const b3Scalar	rl=m_ray.length();
+					if(rl<GJK_MIN_DISTANCE)
+					{/* Touching or inside				*/ 
+						m_status=eStatus::Inside;
+						break;
+					}
+					/* Append new vertice in -'v' direction	*/ 
+					appendvertice(cs,-m_ray);
+					const b3Vector3&	w=cs.c[cs.rank-1]->w;
+					bool				found=false;
+					for(unsigned int i=0;i<4;++i)
+					{
+						if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS)
+						{ found=true;break; }
+					}
+					if(found)
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}
+					else
+					{/* Update lastw					*/ 
+						lastw[clastw=(clastw+1)&3]=w;
+					}
+					/* Check for termination				*/ 
+					const b3Scalar	omega=b3Dot(m_ray,w)/rl;
+					alpha=b3Max(omega,alpha);
+					if(((rl-alpha)-(GJK_ACCURACY*rl))<=0)
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}		
+					/* Reduce simplex						*/ 
+					b3Scalar	weights[4];
+					unsigned int			mask=0;
+					switch(cs.rank)
+					{
+					case	2:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									weights,mask);break;
+					case	3:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									cs.c[2]->w,
+									weights,mask);break;
+					case	4:	sqdist=projectorigin(	cs.c[0]->w,
+									cs.c[1]->w,
+									cs.c[2]->w,
+									cs.c[3]->w,
+									weights,mask);break;
+					}
+					if(sqdist>=0)
+					{/* Valid	*/ 
+						ns.rank		=	0;
+						m_ray		=	b3MakeVector3(0,0,0);
+						m_current	=	next;
+						for(unsigned int i=0,ni=cs.rank;i<ni;++i)
+						{
+							if(mask&(1<<i))
+							{
+								ns.c[ns.rank]		=	cs.c[i];
+								ns.p[ns.rank++]		=	weights[i];
+								m_ray				+=	cs.c[i]->w*weights[i];
+							}
+							else
+							{
+								m_free[m_nfree++]	=	cs.c[i];
+							}
+						}
+						if(mask==15) m_status=eStatus::Inside;
+					}
+					else
+					{/* Return old simplex				*/ 
+						removevertice(m_simplices[m_current]);
+						break;
+					}
+					m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eStatus::Failed;
+				} while(m_status==eStatus::Valid);
+				m_simplex=&m_simplices[m_current];
+				switch(m_status)
+				{
+				case	eStatus::Valid:		m_distance=m_ray.length();break;
+				case	eStatus::Inside:	m_distance=0;break;
+				default:
+					{
+					}
+				}	
+				return(m_status);
+			}
+			bool					EncloseOrigin()
+			{
+				switch(m_simplex->rank)
+				{
+				case	1:
+					{
+						for(unsigned int i=0;i<3;++i)
+						{
+							b3Vector3		axis=b3MakeVector3(0,0,0);
+							axis[i]=1;
+							appendvertice(*m_simplex, axis);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+							appendvertice(*m_simplex,-axis);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+						}
+					}
+					break;
+				case	2:
+					{
+						const b3Vector3	d=m_simplex->c[1]->w-m_simplex->c[0]->w;
+						for(unsigned int i=0;i<3;++i)
+						{
+							b3Vector3		axis=b3MakeVector3(0,0,0);
+							axis[i]=1;
+							const b3Vector3	p=b3Cross(d,axis);
+							if(p.length2()>0)
+							{
+								appendvertice(*m_simplex, p);
+								if(EncloseOrigin())	return(true);
+								removevertice(*m_simplex);
+								appendvertice(*m_simplex,-p);
+								if(EncloseOrigin())	return(true);
+								removevertice(*m_simplex);
+							}
+						}
+					}
+					break;
+				case	3:
+					{
+						const b3Vector3	n=b3Cross(m_simplex->c[1]->w-m_simplex->c[0]->w,
+							m_simplex->c[2]->w-m_simplex->c[0]->w);
+						if(n.length2()>0)
+						{
+							appendvertice(*m_simplex,n);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+							appendvertice(*m_simplex,-n);
+							if(EncloseOrigin())	return(true);
+							removevertice(*m_simplex);
+						}
+					}
+					break;
+				case	4:
+					{
+						if(b3Fabs(det(	m_simplex->c[0]->w-m_simplex->c[3]->w,
+							m_simplex->c[1]->w-m_simplex->c[3]->w,
+							m_simplex->c[2]->w-m_simplex->c[3]->w))>0)
+							return(true);
+					}
+					break;
+				}
+				return(false);
+			}
+			/* Internals	*/ 
+			void				getsupport(const b3Vector3& d,sSV& sv) const
+			{
+				sv.d	=	d/d.length();
+				sv.w	=	m_shape.Support(sv.d,m_verticesA,m_verticesB);
+			}
+			void				removevertice(sSimplex& simplex)
+			{
+				m_free[m_nfree++]=simplex.c[--simplex.rank];
+			}
+			void				appendvertice(sSimplex& simplex,const b3Vector3& v)
+			{
+				simplex.p[simplex.rank]=0;
+				simplex.c[simplex.rank]=m_free[--m_nfree];
+				getsupport(v,*simplex.c[simplex.rank++]);
+			}
+			static b3Scalar		det(const b3Vector3& a,const b3Vector3& b,const b3Vector3& c)
+			{
+				return(	a.y*b.z*c.x+a.z*b.x*c.y-
+					a.x*b.z*c.y-a.y*b.x*c.z+
+					a.x*b.y*c.z-a.z*b.y*c.x);
+			}
+			static b3Scalar		projectorigin(	const b3Vector3& a,
+				const b3Vector3& b,
+				b3Scalar* w,unsigned int& m)
+			{
+				const b3Vector3	d=b-a;
+				const b3Scalar	l=d.length2();
+				if(l>GJK_SIMPLEX2_EPS)
+				{
+					const b3Scalar	t(l>0?-b3Dot(a,d)/l:0);
+					if(t>=1)		{ w[0]=0;w[1]=1;m=2;return(b.length2()); }
+					else if(t<=0)	{ w[0]=1;w[1]=0;m=1;return(a.length2()); }
+					else			{ w[0]=1-(w[1]=t);m=3;return((a+d*t).length2()); }
+				}
+				return(-1);
+			}
+			static b3Scalar		projectorigin(	const b3Vector3& a,	// Triangle case: squared distance from the origin to triangle a-b-c, or -1 when |n|^2 <= GJK_SIMPLEX3_EPS.
+				const b3Vector3& b,
+				const b3Vector3& c,
+				b3Scalar* w,unsigned int& m)	// w[0..2]: weights of a, b, c at the closest point; m: bitmask of the vertices used (bit i = vertex i).
+			{
+				static const unsigned int		imd3[]={1,2,0};	// (i+1) mod 3
+				const b3Vector3*	vt[]={&a,&b,&c};
+				const b3Vector3		dl[]={a-b,b-c,c-a};	// edge vectors
+				const b3Vector3		n=b3Cross(dl[0],dl[1]);	// unnormalized triangle normal
+				const b3Scalar		l=n.length2();
+				if(l>GJK_SIMPLEX3_EPS)
+				{
+					b3Scalar	mindist=-1;	// -1 = no edge projection taken yet
+					b3Scalar	subw[2]={0.f,0.f};
+					unsigned int			subm(0);
+					for(unsigned int i=0;i<3;++i)
+					{
+						if(b3Dot(*vt[i],b3Cross(dl[i],n))>0)	// edge (vt[i],vt[j]) side test; when no edge passes, the plane projection below is used instead
+						{
+							const unsigned int			j=imd3[i];
+							const b3Scalar	subd(projectorigin(*vt[i],*vt[j],subw,subm));	// segment overload
+							if((mindist<0)||(subd<mindist))
+							{
+								mindist		=	subd;
+								m			=	static_cast<unsigned int>(((subm&1)?1<<i:0)+((subm&2)?1<<j:0));	// remap the segment's vertex bits to triangle vertex bits
+								w[i]		=	subw[0];
+								w[j]		=	subw[1];
+								w[imd3[j]]	=	0;				// the third vertex does not contribute
+							}
+						}
+					}
+					if(mindist<0)
+					{
+						const b3Scalar	d=b3Dot(a,n);	
+						const b3Scalar	s=b3Sqrt(l);	// |n|
+						const b3Vector3	p=n*(d/l);	// projection of the origin onto the triangle's plane
+						mindist	=	p.length2();
+						m		=	7;	// all three vertices contribute
+						w[0]	=	(b3Cross(dl[1],b-p)).length()/s;	// weights from cross-product lengths relative to |n|
+						w[1]	=	(b3Cross(dl[2],c-p)).length()/s;
+						w[2]	=	1-(w[0]+w[1]);
+					}
+					return(mindist);
+				}
+				return(-1);
+			}
+			static b3Scalar		projectorigin(	const b3Vector3& a,	// Tetrahedron case: squared distance from the origin to tetrahedron a-b-c-d (0 when no face test passes), or -1 when rejected.
+				const b3Vector3& b,
+				const b3Vector3& c,
+				const b3Vector3& d,
+				b3Scalar* w,unsigned int& m)	// w[0..3]: weights of a, b, c, d; m: bitmask of the vertices used (bit i = vertex i, d = bit 3).
+			{
+				static const unsigned int		imd3[]={1,2,0};	// (i+1) mod 3
+				const b3Vector3*	vt[]={&a,&b,&c,&d};
+				const b3Vector3		dl[]={a-d,b-d,c-d};	// edges from d to the other three vertices
+				const b3Scalar		vl=det(dl[0],dl[1],dl[2]);	// signed determinant of the edge matrix
+				const bool			ng=(vl*b3Dot(a,b3Cross(b-c,a-b)))<=0;	// NOTE(review): looks like an orientation consistency check between vl and the a-b-c face — confirm
+				if(ng&&(b3Fabs(vl)>GJK_SIMPLEX4_EPS))
+				{
+					b3Scalar	mindist=-1;	// -1 = no face projection taken yet
+					b3Scalar	subw[3]={0.f,0.f,0.f};
+					unsigned int			subm(0);
+					for(unsigned int i=0;i<3;++i)	// the three faces that contain d: (vt[i],vt[j],d)
+					{
+						const unsigned int			j=imd3[i];
+						const b3Scalar	s=vl*b3Dot(d,b3Cross(dl[i],dl[j]));
+						if(s>0)	// face side test; when no face passes, the branch below reports distance 0
+						{
+							const b3Scalar	subd=projectorigin(*vt[i],*vt[j],d,subw,subm);	// triangle overload
+							if((mindist<0)||(subd<mindist))
+							{
+								mindist		=	subd;
+								m			=	static_cast<unsigned int>((subm&1?1<<i:0)+	// remap the triangle's vertex bits to tetrahedron vertex bits
+									(subm&2?1<<j:0)+
+									(subm&4?8:0));
+								w[i]		=	subw[0];
+								w[j]		=	subw[1];
+								w[imd3[j]]	=	0;	// the remaining vertex of a-b-c does not contribute
+								w[3]		=	subw[2];
+							}
+						}
+					}
+					if(mindist<0)
+					{
+						mindist	=	0;
+						m		=	15;	// all four vertices contribute
+						w[0]	=	det(c,b,d)/vl;	// weights as determinant ratios
+						w[1]	=	det(a,c,d)/vl;
+						w[2]	=	det(b,a,d)/vl;
+						w[3]	=	1-(w[0]+w[1]+w[2]);
+					}
+					return(mindist);
+				}
+				return(-1);
+			}
+	};
+
+	// EPA
+	struct	b3EPA
+	{
+		/* Types		*/ 
+		typedef	b3GJK::sSV	sSV;
+		struct	sFace	// One triangle of the polytope built by b3EPA.
+		{
+			b3Vector3	n;	// face normal; newface() divides it by its length
+			b3Scalar	d;	// distance value assigned by newface(): plane distance, or distance to the nearest edge/vertex from getedgedist()
+			sSV*		c[3];	// the three support vertices
+			sFace*		f[3];	// neighbouring face across each edge, set by bind()
+			sFace*		l[2];	// intrusive list links used by append()/remove(): l[0] = previous, l[1] = next
+			unsigned char			e[3];	// for each edge, the matching edge index inside the neighbour f[i], set by bind()
+			unsigned char			pass;	// stamp of the last expansion pass that visited this face
+		};
+		struct	sList	// Head pointer and element count of an intrusive doubly linked list of faces.
+		{
+			sFace*		root;	// first face, or 0 when the list is empty
+			unsigned int			count;	// number of faces currently linked
+			sList() : root(0),count(0)	{}
+		};
+		struct	sHorizon	// Bookkeeping for the chain of faces created by expand() during one pass.
+		{
+			sFace*		cf;	// most recently created face
+			sFace*		ff;	// first face created in this pass
+			unsigned int			nf;	// number of faces created so far
+			sHorizon() : cf(0),ff(0),nf(0)	{}
+		};
+		struct	eStatus { enum _ {	// Outcome codes stored in m_status.
+			Valid,	// set by Evaluate() once the hull is being built
+			Touching,
+			Degenerated,	// set by newface() when the triangle's cross product length is <= EPA_ACCURACY
+			NonConvex,	// set by newface() when a non-forced face has d < -EPA_PLANE_EPS
+			InvalidHull,		
+			OutOfFaces,	// set by newface() when the stock list is empty
+			OutOfVertices,	// set by Evaluate() when m_sv_store is exhausted
+			AccuraryReached,	// set by Evaluate() when the new support point is within EPA_ACCURACY of the best face
+			FallBack,	// set by Evaluate() when no hull could be built and -guess is returned as the normal
+			Failed		};};
+			/* Fields		*/ 
+			eStatus::_		m_status;	// outcome of the last Evaluate()/newface() step
+			b3GJK::sSimplex	m_result;	// support vertices and weights describing the result
+			b3Vector3		m_normal;	// result normal
+			b3Scalar		m_depth;	// result depth
+			sSV				m_sv_store[EPA_MAX_VERTICES];	// pool of support vertices handed out by Evaluate()
+			sFace			m_fc_store[EPA_MAX_FACES];	// pool of faces, linked into m_stock or m_hull
+			unsigned int				m_nextsv;	// index of the next unused entry in m_sv_store
+			sList			m_hull;	// faces currently forming the polytope
+			sList			m_stock;	// faces available for reuse
+			/* Methods		*/ 
+			b3EPA()	// Construction just delegates to Initialize().
+			{
+				this->Initialize();
+			}
+
+
+			static inline void		bind(sFace* fa,unsigned int ea,sFace* fb,unsigned int eb)	// Makes fa and fb neighbours: edge ea of fa is paired with edge eb of fb, in both directions.
+			{
+				fa->f[ea]=fb; fa->e[ea]=static_cast<unsigned char>(eb);
+				fb->f[eb]=fa; fb->e[eb]=static_cast<unsigned char>(ea);
+			}
+			static inline void		append(sList& list,sFace* face)	// Pushes face at the head of list and bumps the count.
+			{
+				sFace* const	oldHead	=	list.root;
+				face->l[0]	=	0; face->l[1]	=	oldHead;
+				if(oldHead!=0) { oldHead->l[0]=face; }
+				list.root	=	face;
+				list.count	+=	1;
+			}
+			static inline void		remove(sList& list,sFace* face)	// Unlinks face from list; the face's own l[] links are left as they were.
+			{
+				sFace* const	prev=face->l[0]; sFace* const	next=face->l[1];
+				if(next!=0) { next->l[0]=prev; }
+				if(prev!=0) { prev->l[1]=next; }
+				if(list.root==face) { list.root=next; } list.count-=1;
+			}
+
+
+			void				Initialize()
+			{
+				// Reset the result fields, then link every pooled face into the stock list.
+				m_nextsv	=	0;
+				m_depth		=	0;
+				m_normal	=	b3MakeVector3(0,0,0);
+				m_status	=	eStatus::Failed;
+				// Walk the pool from its last entry down to entry 0; append() pushes at the head, so entry 0 ends up first.
+				for(unsigned int slot=EPA_MAX_FACES;slot>0;--slot)
+					append(m_stock,&m_fc_store[slot-1]);
+			}
+			eStatus::_			Evaluate(b3GJK& gjk,const b3Vector3& guess)	// Grows a polytope from the GJK simplex to obtain m_normal/m_depth/m_result; falls back to -guess when no hull can be built. Returns m_status.
+			{
+				b3GJK::sSimplex&	simplex=*gjk.m_simplex;
+				if((simplex.rank>1)&&gjk.EncloseOrigin())
+				{
+					// NOTE(review): the code below reads simplex.c[0..3], so EncloseOrigin() is presumably expected to leave a rank-4 simplex — confirm
+					/* Clean up: hand every face of a previous hull back to the stock list */ 
+					while(m_hull.root)
+					{
+						sFace*	f = m_hull.root;
+						remove(m_hull,f);
+						append(m_stock,f);
+					}
+					m_status	=	eStatus::Valid;
+					m_nextsv	=	0;
+					/* Orient simplex: swap two vertices when the determinant is negative */ 
+					if(gjk.det(	simplex.c[0]->w-simplex.c[3]->w,
+						simplex.c[1]->w-simplex.c[3]->w,
+						simplex.c[2]->w-simplex.c[3]->w)<0)
+					{
+						b3Swap(simplex.c[0],simplex.c[1]);
+						b3Swap(simplex.p[0],simplex.p[1]);
+					}
+					/* Build initial hull: four forced faces of the tetrahedron */ 
+					sFace*	tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true),
+						newface(simplex.c[1],simplex.c[0],simplex.c[3],true),
+						newface(simplex.c[2],simplex.c[1],simplex.c[3],true),
+						newface(simplex.c[0],simplex.c[2],simplex.c[3],true)};
+					if(m_hull.count==4)	// all four faces were created
+					{
+						sFace*		best=findbest();
+						sFace		outer=*best;	// copy of the best face; the result below is read from this copy
+						unsigned int			pass=0;
+						unsigned int			iterations=0;
+						bind(tetra[0],0,tetra[1],0);	// connect the four faces along their shared edges
+						bind(tetra[0],1,tetra[2],0);
+						bind(tetra[0],2,tetra[3],0);
+						bind(tetra[1],1,tetra[3],2);
+						bind(tetra[1],2,tetra[2],1);
+						bind(tetra[2],2,tetra[3],1);
+						m_status=eStatus::Valid;
+						for(;iterations<EPA_MAX_ITERATIONS;++iterations)
+						{
+							if(m_nextsv<EPA_MAX_VERTICES)
+							{	
+								sHorizon		horizon;
+								sSV*			w=&m_sv_store[m_nextsv++];	// next vertex from the pool
+								bool			valid=true;					
+								best->pass	=	(unsigned char)(++pass);	// stamp the best face so expand() treats it as already visited
+								gjk.getsupport(best->n,*w);	// support point along the best face's normal
+								const b3Scalar	wdist=b3Dot(best->n,w->w)-best->d;	// how far the support point lies beyond the best face
+								if(wdist>EPA_ACCURACY)
+								{
+									for(unsigned int j=0;(j<3)&&valid;++j)	// expand across each of the best face's three edges
+									{
+										valid&=expand(	pass,w,
+											best->f[j],best->e[j],
+											horizon);
+									}
+									if(valid&&(horizon.nf>=3))
+									{
+										bind(horizon.cf,1,horizon.ff,2);	// close the chain: last new face to first new face
+										remove(m_hull,best);
+										append(m_stock,best);
+										best=findbest();
+										outer=*best;
+									} else { 
+										m_status=eStatus::Failed;
+										//m_status=eStatus::InvalidHull;
+									break; }
+								} else { m_status=eStatus::AccuraryReached;break; }
+							} else { m_status=eStatus::OutOfVertices;break; }
+						}
+						const b3Vector3	projection=outer.n*outer.d;	// point at distance outer.d along outer.n
+						m_normal	=	outer.n;
+						m_depth		=	outer.d;
+						m_result.rank	=	3;
+						m_result.c[0]	=	outer.c[0];
+						m_result.c[1]	=	outer.c[1];
+						m_result.c[2]	=	outer.c[2];
+						m_result.p[0]	=	b3Cross(	outer.c[1]->w-projection,	// weights from cross-product lengths, normalized by their sum below
+							outer.c[2]->w-projection).length();
+						m_result.p[1]	=	b3Cross(	outer.c[2]->w-projection,
+							outer.c[0]->w-projection).length();
+						m_result.p[2]	=	b3Cross(	outer.c[0]->w-projection,
+							outer.c[1]->w-projection).length();
+						const b3Scalar	sum=m_result.p[0]+m_result.p[1]+m_result.p[2];	// NOTE(review): not guarded against sum == 0
+						m_result.p[0]	/=	sum;
+						m_result.p[1]	/=	sum;
+						m_result.p[2]	/=	sum;
+						return(m_status);
+					}
+				}
+				/* Fallback: no hull was built; report depth 0 along the normalized -guess, or (1,0,0) when guess is zero */ 
+				m_status	=	eStatus::FallBack;
+				m_normal	=	-guess;
+				const b3Scalar	nl=m_normal.length();
+				if(nl>0)
+					m_normal	=	m_normal/nl;
+				else
+					m_normal	=	b3MakeVector3(1,0,0);
+				m_depth	=	0;
+				m_result.rank=1;
+				m_result.c[0]=simplex.c[0];
+				m_result.p[0]=1;	
+				return(m_status);
+			}
+			bool getedgedist(sFace* face, sSV* a, sSV* b, b3Scalar& dist)	// If the edge test below places the origin outside edge a->b of face, writes the distance to that edge (or its nearer end point) into dist and returns true; otherwise returns false and leaves dist untouched.
+			{
+				const b3Vector3 ba = b->w - a->w;
+				const b3Vector3 n_ab = b3Cross(ba, face->n); // Outward facing edge normal direction, on triangle plane
+				const b3Scalar a_dot_nab = b3Dot(a->w, n_ab); // Only care about the sign to determine inside/outside, so no normalization is required
+
+				if(a_dot_nab < 0)
+				{
+					// Outside of edge a->b
+
+					const b3Scalar ba_l2 = ba.length2();
+					const b3Scalar a_dot_ba = b3Dot(a->w, ba);
+					const b3Scalar b_dot_ba = b3Dot(b->w, ba);
+
+					if(a_dot_ba > 0)
+					{
+						// Pick distance vertex a
+						dist = a->w.length();
+					}
+					else if(b_dot_ba < 0)
+					{
+						// Pick distance vertex b
+						dist = b->w.length();
+					}
+					else
+					{
+						// Pick distance to edge a->b: (|a|^2 |b|^2 - (a.b)^2) / |b-a|^2, clamped at 0 before the square root
+						const b3Scalar a_dot_b = b3Dot(a->w, b->w);
+						dist = b3Sqrt(b3Max((a->w.length2() * b->w.length2() - a_dot_b * a_dot_b) / ba_l2, (b3Scalar)0));
+					}
+
+					return true;
+				}
+
+				return false;
+			}
+			sFace*				newface(sSV* a,sSV* b,sSV* c,bool forced)	// Takes a face from the stock list, sets it up as triangle a-b-c and links it into the hull. Returns 0 (with m_status set) on failure.
+			{
+				if(m_stock.root)
+				{
+					sFace*	face=m_stock.root;
+					remove(m_stock,face);
+					append(m_hull,face);
+					face->pass	=	0;
+					face->c[0]	=	a;
+					face->c[1]	=	b;
+					face->c[2]	=	c;
+					face->n		=	b3Cross(b->w-a->w,c->w-a->w);
+					const b3Scalar	l=face->n.length();
+					const bool		v=l>EPA_ACCURACY;
+					// v is false when the triangle's cross product is too short to normalize (Degenerated)
+					if(v)
+					{
+						if(!(getedgedist(face, a, b, face->d) ||
+							 getedgedist(face, b, c, face->d) ||
+							 getedgedist(face, c, a, face->d)))
+						{
+							// Origin projects to the interior of the triangle
+							// Use distance to triangle plane
+							face->d = b3Dot(a->w, face->n) / l;
+						}
+						// getedgedist() reads the unnormalized normal, so normalize only afterwards
+						face->n /= l;
+						if(forced || (face->d >= -EPA_PLANE_EPS))
+						{
+							return face;
+						}
+						else
+							m_status=eStatus::NonConvex;
+					}
+					else
+						m_status=eStatus::Degenerated;
+					// Rejected: give the face back to the stock list
+					remove(m_hull, face);
+					append(m_stock, face);
+					return 0;
+
+				}
+				m_status = eStatus::OutOfFaces;	// only reached when m_stock.root is null, so the former OutOfVertices alternative could never be selected
+				return 0;
+			}
+			sFace*				findbest()
+			{
+				// Linear scan of the hull list for the face with the smallest d*d. The hull must not be empty: root is dereferenced unchecked.
+				sFace*		closest=m_hull.root;
+				b3Scalar	closestSq=closest->d*closest->d;
+				sFace*		cur=closest->l[1];
+				while(cur)
+				{
+					const b3Scalar	curSq=cur->d*cur->d;
+					// strict '<' keeps the earliest face in list order on ties
+					if(curSq<closestSq) { closest=cur; closestSq=curSq; }
+					cur=cur->l[1];
+				}
+				return(closest);
+			}
+			bool				expand(unsigned int pass,sSV* w,sFace* f,unsigned int e,sHorizon& horizon)	// Recursive step of one expansion pass: visits face f, entered across its edge e, for new vertex w. Returns false if f was already stamped with this pass, if newface() fails, or if a recursive call fails.
+			{
+				static const unsigned int	i1m3[]={1,2,0};	// (i+1) mod 3
+				static const unsigned int	i2m3[]={2,0,1};	// (i+2) mod 3
+				if(f->pass!=pass)
+				{
+					const unsigned int	e1=i1m3[e];
+					if((b3Dot(f->n,w->w)-f->d)<-EPA_PLANE_EPS)	// w lies below f's plane: f stays in the hull and a new face is built on the shared edge
+					{
+						sFace*	nf=newface(f->c[e1],f->c[e],w,false);
+						if(nf)
+						{
+							bind(nf,0,f,e);	// edge 0 of the new face pairs with edge e of f
+							if(horizon.cf) bind(horizon.cf,1,nf,2); else horizon.ff=nf;	// chain to the previously created face, or remember the first one
+							horizon.cf=nf;
+							++horizon.nf;
+							return(true);
+						}
+					}
+					else
+					{
+						const unsigned int	e2=i2m3[e];
+						f->pass		=	(unsigned char)pass;	// stamp before recursing so f is not entered again in this pass
+						if(	expand(pass,w,f->f[e1],f->e[e1],horizon)&&	// continue across f's other two edges
+							expand(pass,w,f->f[e2],f->e[e2],horizon))
+						{
+							remove(m_hull,f);	// f is replaced by the new faces: return it to the stock list
+							append(m_stock,f);
+							return(true);
+						}
+					}
+				}
+				return(false);
+			}
+
+	};
+
+	//
+	static void	Initialize(const b3Transform&	transA, const b3Transform&	transB,
+								const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+								const b3AlignedObjectArray<b3Vector3>& verticesA,
+								const b3AlignedObjectArray<b3Vector3>& verticesB,
+		b3GjkEpaSolver2::sResults& results,
+		tShape& shape,
+		bool withmargins)
+	{
+		/* Results: start out as "separated" with both witness points at the origin (verticesA/verticesB are not used here) */ 
+		results.witnesses[1]	=	b3MakeVector3(0,0,0);
+		results.witnesses[0]	=	results.witnesses[1];
+		results.status			=	b3GjkEpaSolver2::sResults::Separated;
+		/* Shape: store both hulls plus the relative transforms between their frames */ 
+		shape.m_shapes[0]		=	hullA;
+		shape.m_shapes[1]		=	hullB;
+		shape.m_toshape0		=	transA.inverseTimes(transB);
+		shape.m_toshape1		=	transB.getBasis().transposeTimes(transA.getBasis());
+		shape.EnableMargin(withmargins);
+	}
+
+}
+
+//
+// Api
+//
+
+using namespace	gjkepa2_impl2;
+
+//
+int			b3GjkEpaSolver2::StackSizeRequirement()	// Combined size in bytes of one b3GJK and one b3EPA object.
+{
+	return static_cast<int>(sizeof(b3GJK)+sizeof(b3EPA));
+}
+
+//
+bool		b3GjkEpaSolver2::Distance(	const b3Transform&	transA, const b3Transform&	transB,
+										const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+										const b3AlignedObjectArray<b3Vector3>& verticesA,
+										const b3AlignedObjectArray<b3Vector3>& verticesB,
+									  const b3Vector3&		guess,
+									  sResults&				results)
+{
+	// Runs GJK with margins disabled. Returns true and fills witnesses/normal/distance only when GJK ends in the Valid state.
+	tShape			minkowski;
+	Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,minkowski,false);
+	b3GJK				solver(verticesA,verticesB);
+	const b3GJK::eStatus::_	outcome=solver.Evaluate(minkowski,guess);
+	if(outcome!=b3GJK::eStatus::Valid)
+	{
+		if(outcome==b3GJK::eStatus::Inside) { results.status=sResults::Penetrating; }
+		else { results.status=sResults::GJK_Failed; }
+		return(false);
+	}
+	// Blend the per-vertex support points of each shape with the simplex weights to get one witness point per shape.
+	b3Vector3	onA=b3MakeVector3(0,0,0);
+	b3Vector3	onB=b3MakeVector3(0,0,0);
+	for(unsigned int k=0;k<solver.m_simplex->rank;++k)
+	{
+		const b3Vector3&	dir=solver.m_simplex->c[k]->d;
+		const b3Scalar		weight=solver.m_simplex->p[k];
+		onA+=minkowski.Support( dir,0,verticesA,verticesB)*weight;
+		onB+=minkowski.Support(-dir,1,verticesA,verticesB)*weight;
+	}
+	results.witnesses[0]	=	transA*onA;
+	results.witnesses[1]	=	transA*onB;
+	results.normal			=	onA-onB;
+	results.distance		=	results.normal.length();
+	const b3Scalar	divisor=results.distance>GJK_MIN_DISTANCE?results.distance:1;
+	results.normal			/=	divisor;
+	return(true);
+}
+
+//
+bool	b3GjkEpaSolver2::Penetration(	const b3Transform&	transA, const b3Transform&	transB,
+										const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+										const b3AlignedObjectArray<b3Vector3>& verticesA,
+										const b3AlignedObjectArray<b3Vector3>& verticesB,
+									 const b3Vector3&		guess,
+									 sResults&				results,
+									 bool					usemargins)
+{
+	// Runs GJK and, when it reports Inside, EPA. Returns true and fills results only when EPA does not end in Failed.
+	tShape			minkowski;
+	Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,minkowski,usemargins);
+	b3GJK				solver(verticesA,verticesB);
+	const b3GJK::eStatus::_	outcome=solver.Evaluate(minkowski,guess);
+	if(outcome==b3GJK::eStatus::Failed)
+	{
+		results.status=sResults::GJK_Failed;
+		return(false);
+	}
+	if(outcome!=b3GJK::eStatus::Inside)
+	{
+		// Any other GJK outcome: results.status keeps the value assigned by Initialize().
+		return(false);
+	}
+	// GJK reported Inside: let EPA work out the normal and depth, seeded with the negated guess.
+	b3EPA				expander;
+	const b3EPA::eStatus::_	epaOutcome=expander.Evaluate(solver,-guess);
+	if(epaOutcome==b3EPA::eStatus::Failed)
+	{
+		results.status=sResults::EPA_Failed;
+		return(false);
+	}
+	// Blend shape 0's support points with the EPA result weights; the second witness is offset along the EPA normal by the depth.
+	b3Vector3	onA=b3MakeVector3(0,0,0);
+	for(unsigned int k=0;k<expander.m_result.rank;++k)
+	{
+		onA+=minkowski.Support(expander.m_result.c[k]->d,0,verticesA,verticesB)*expander.m_result.p[k];
+	}
+	results.status			=	sResults::Penetrating;
+	results.witnesses[0]	=	transA*onA;
+	results.witnesses[1]	=	transA*(onA-expander.m_normal*expander.m_depth);
+	results.normal			=	-expander.m_normal;
+	results.distance		=	-expander.m_depth;
+	return(true);
+}
+
+
+#if 0
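+// NOTE: disabled code. The SignedDistance overloads below still reference the bt* convex-shape API (btSphereShape, btConvexShape, GJK) and names that are not declared here (shape0, wtrs0); they have not been ported to the hull/vertex-array interface used by Distance() and Penetration().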
+b3Scalar	b3GjkEpaSolver2::SignedDistance(const b3Vector3& position,
+											b3Scalar margin,
+											const b3Transform&	transA,
+											const b3ConvexPolyhedronData& hullA, 
+											const b3AlignedObjectArray<b3Vector3>& verticesA,
+											sResults& results)
+{
+	tShape			shape;
+	btSphereShape	shape1(margin);
+	b3Transform		wtrs1(b3Quaternion(0,0,0,1),position);
+	Initialize(shape0,wtrs0,&shape1,wtrs1,results,shape,false);
+	GJK				gjk;	
+	GJK::eStatus::_	gjk_status=gjk.Evaluate(shape,b3Vector3(1,1,1));
+	if(gjk_status==GJK::eStatus::Valid)
+	{
+		b3Vector3	w0=b3Vector3(0,0,0);
+		b3Vector3	w1=b3Vector3(0,0,0);
+		for(unsigned int i=0;i<gjk.m_simplex->rank;++i)
+		{
+			const b3Scalar	p=gjk.m_simplex->p[i];
+			w0+=shape.Support( gjk.m_simplex->c[i]->d,0)*p;
+			w1+=shape.Support(-gjk.m_simplex->c[i]->d,1)*p;
+		}
+		results.witnesses[0]	=	wtrs0*w0;
+		results.witnesses[1]	=	wtrs0*w1;
+		const b3Vector3	delta=	results.witnesses[1]-
+			results.witnesses[0];
+		const b3Scalar	margin=	shape0->getMarginNonVirtual()+
+			shape1.getMarginNonVirtual();
+		const b3Scalar	length=	delta.length();	
+		results.normal			=	delta/length;
+		results.witnesses[0]	+=	results.normal*margin;
+		return(length-margin);
+	}
+	else
+	{
+		if(gjk_status==GJK::eStatus::Inside)
+		{
+			if(Penetration(shape0,wtrs0,&shape1,wtrs1,gjk.m_ray,results))
+			{
+				const b3Vector3	delta=	results.witnesses[0]-
+					results.witnesses[1];
+				const b3Scalar	length=	delta.length();
+				if (length >= B3_EPSILON)
+					results.normal	=	delta/length;			
+				return(-length);
+			}
+		}	
+	}
+	return(B3_INFINITY);
+}
+
+//
+bool	b3GjkEpaSolver2::SignedDistance(const btConvexShape*	shape0,
+										const b3Transform&		wtrs0,
+										const btConvexShape*	shape1,
+										const b3Transform&		wtrs1,
+										const b3Vector3&		guess,
+										sResults&				results)
+{
+	if(!Distance(shape0,wtrs0,shape1,wtrs1,guess,results))
+		return(Penetration(shape0,wtrs0,shape1,wtrs1,guess,results,false));
+	else
+		return(true);
+}
+#endif
+
+
+/* Symbols cleanup		*/ 
+
+#undef GJK_MAX_ITERATIONS
+#undef GJK_ACCURACY
+#undef GJK_MIN_DISTANCE
+#undef GJK_DUPLICATED_EPS
+#undef GJK_SIMPLEX2_EPS
+#undef GJK_SIMPLEX3_EPS
+#undef GJK_SIMPLEX4_EPS
+
+#undef EPA_MAX_VERTICES
+#undef EPA_MAX_FACES
+#undef EPA_MAX_ITERATIONS
+#undef EPA_ACCURACY
+#undef EPA_FALLBACK
+#undef EPA_PLANE_EPS
+#undef EPA_INSIDE_EPS
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h
new file mode 100644
index 0000000000..976238a04c
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h
@@ -0,0 +1,82 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2008 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software in a
+product, an acknowledgment in the product documentation would be appreciated
+but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+/*
+GJK-EPA collision solver by Nathanael Presson, 2008
+*/
+#ifndef B3_GJK_EPA2_H
+#define B3_GJK_EPA2_H
+
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+
+
+///btGjkEpaSolver contributed under zlib by Nathanael Presson
+struct	b3GjkEpaSolver2	// Static entry points for GJK distance and EPA penetration queries between two convex hulls.
+{
+struct	sResults	// Output of Distance() and Penetration().
+	{
+	enum eStatus
+		{
+		Separated,		/* Shapes do not penetrate													*/ 
+		Penetrating,	/* Shapes are penetrating													*/ 
+		GJK_Failed,		/* GJK phase failed; no big issue, shapes are probably just 'touching'		*/ 
+		EPA_Failed		/* EPA phase failed; bigger problem, need to save parameters and debug		*/ 
+		}		status;
+	b3Vector3	witnesses[2];	// one witness point per shape, both transformed by transA
+	b3Vector3	normal;
+	b3Scalar	distance;	// Distance() stores the witness separation; Penetration() stores the negated EPA depth
+	};
+// Returns sizeof(b3GJK)+sizeof(b3EPA).
+static int		StackSizeRequirement();
+// Runs GJK without margins; returns true and fills results when GJK ends in its Valid state, otherwise only results.status is updated.
+static bool		Distance(	 const b3Transform&	transA, const b3Transform&	transB,
+							const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+							const b3AlignedObjectArray<b3Vector3>& verticesA,
+							const b3AlignedObjectArray<b3Vector3>& verticesB,
+							const b3Vector3& guess,
+							sResults& results);
+// Runs GJK and, when it reports Inside, EPA; returns true and fills results unless EPA ends in its Failed state.
+static bool		Penetration( const b3Transform&	transA, const b3Transform&	transB,
+							const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, 
+							const b3AlignedObjectArray<b3Vector3>& verticesA,
+							const b3AlignedObjectArray<b3Vector3>& verticesB,
+							const b3Vector3& guess,
+							sResults& results,
+							bool usemargins=true);
+#if 0	// disabled: these declarations still use the bt* shape and transform types
+static b3Scalar	SignedDistance(	const b3Vector3& position,
+								b3Scalar margin,
+								const btConvexShape* shape,
+								const btTransform& wtrs,
+								sResults& results);
+							
+static bool		SignedDistance(	const btConvexShape* shape0,const btTransform& wtrs0,
+								const btConvexShape* shape1,const btTransform& wtrs1,
+								const b3Vector3& guess,
+								sResults& results);
+#endif 
+
+};
+
+#endif //B3_GJK_EPA2_H
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
new file mode 100644
index 0000000000..e9e51d5a36
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
@@ -0,0 +1,390 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "b3OptimizedBvh.h"
+#include "b3StridingMeshInterface.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+
+b3OptimizedBvh::b3OptimizedBvh()	// Nothing to set up in this class; the body is intentionally empty.
+{ 
+}
+
+b3OptimizedBvh::~b3OptimizedBvh()	// Nothing to release in this class; the body is intentionally empty.
+{
+}
+
+
+void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)	// Collects one leaf node per triangle of the mesh (quantized or not), then builds the tree with buildTree().
+{
+	m_useQuantization = useQuantizedAabbCompression;
+
+
+	// NodeArray	triangleNodes;
+
+	struct	NodeTriangleCallback : public b3InternalTriangleIndexCallback	// Appends one unquantized leaf node per triangle.
+	{
+
+		NodeArray&	m_triangleNodes;	// destination array (not owned)
+
+		NodeTriangleCallback& operator=(NodeTriangleCallback& other)	// the reference member cannot be reseated, so assignment copies the array contents
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			return *this;
+		}
+		
+		NodeTriangleCallback(NodeArray&	triangleNodes)
+			:m_triangleNodes(triangleNodes)
+		{
+		}
+
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex)	// reads triangle[0..2]
+		{
+			b3OptimizedBvhNode node;
+			b3Vector3	aabbMin,aabbMax;
+			aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));	// start with an inverted box so the first vertex sets both bounds
+			aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+
+			//with quantization?
+			node.m_aabbMinOrg = aabbMin;
+			node.m_aabbMaxOrg = aabbMax;
+
+			node.m_escapeIndex = -1;
+	
+			//for child nodes
+			node.m_subPart = partId;
+			node.m_triangleIndex = triangleIndex;
+			m_triangleNodes.push_back(node);
+		}
+	};
+	struct	QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback	// Appends one quantized leaf node per triangle.
+	{
+		QuantizedNodeArray&	m_triangleNodes;	// destination array (not owned)
+		const b3QuantizedBvh* m_optimizedTree; // for quantization
+
+		QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)	// the reference member cannot be reseated, so assignment copies the array contents
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			m_optimizedTree = other.m_optimizedTree;
+			return *this;
+		}
+
+		QuantizedNodeTriangleCallback(QuantizedNodeArray&	triangleNodes,const b3QuantizedBvh* tree)
+			:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
+		{
+		}
+
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex)	// reads triangle[0..2]
+		{
+			// The partId and triangle index must fit in the same (positive) integer
+			b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
+			b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
+			//negative indices are reserved for escapeIndex
+			b3Assert(triangleIndex>=0);
+
+			b3QuantizedBvhNode node;
+			b3Vector3	aabbMin,aabbMax;
+			aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));	// start with an inverted box so the first vertex sets both bounds
+			aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+
+			//PCK: add these checks for zero dimensions of aabb (an axis thinner than 0.002 is widened by 0.001 on each side)
+			const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002);
+			const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001);
+			if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
+			}
+
+			m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
+			m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
+
+			node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex;	// part id in the high bits, triangle index in the low bits
+
+			m_triangleNodes.push_back(node);
+		}
+	};
+	
+
+
+	int numLeafNodes = 0;
+
+	
+	if (m_useQuantization)
+	{
+
+		//initialize quantization values
+		setQuantizationValues(bvhAabbMin,bvhAabbMax);
+
+		QuantizedNodeTriangleCallback	callback(m_quantizedLeafNodes,this);
+
+	
+		triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
+
+		//now we have an array of leaf nodes in m_quantizedLeafNodes
+		numLeafNodes = m_quantizedLeafNodes.size();
+
+
+		m_quantizedContiguousNodes.resize(2*numLeafNodes);
+
+
+	} else
+	{
+		NodeTriangleCallback	callback(m_leafNodes);
+
+		b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
+		b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+
+		triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
+
+		//now we have an array of leafnodes in m_leafNodes
+		numLeafNodes = m_leafNodes.size();
+
+		m_contiguousNodes.resize(2*numLeafNodes);
+	}
+
+	m_curNodeIndex = 0;
+
+	buildTree(0,numLeafNodes);
+
+	///if the entire tree is smaller than the subtree size, we need to create a header info for the tree
+	if(m_useQuantization && !m_SubtreeHeaders.size())
+	{
+		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+		subtree.m_rootNodeIndex = 0;
+		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+}
+
+
+
+
+void	b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
+{
+	// Only quantized trees are refitted; for a non-quantized tree this call does nothing.
+	if (!m_useQuantization)
+	{
+		return;
+	}
+
+	// Re-derive the quantization parameters from the new bounds.
+	setQuantizationValues(aabbMin,aabbMax);
+
+	// Recompute every node in [0, m_curNodeIndex).
+	updateBvhNodes(meshInterface,0,m_curNodeIndex,0);
+
+	///now update all subtree headers
+	///(each header takes the freshly recomputed box of its root node)
+
+	for (int headerIdx=0;headerIdx<m_SubtreeHeaders.size();++headerIdx)
+	{
+		b3BvhSubtreeInfo& header = m_SubtreeHeaders[headerIdx];
+		header.setAabbFromQuantizeNode(m_quantizedContiguousNodes[header.m_rootNodeIndex]);
+	}
+}
+
+
+
+
+void	b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
+{
+	// Partial refit is only defined for quantized trees.
+	b3Assert(m_useQuantization);
+
+	// The query box must lie strictly inside the tree's current bounds (checked per axis).
+	b3Assert(aabbMin.getX() > m_bvhAabbMin.getX());
+	b3Assert(aabbMin.getY() > m_bvhAabbMin.getY());
+	b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ());
+
+	b3Assert(aabbMax.getX() < m_bvhAabbMax.getX());
+	b3Assert(aabbMax.getY() < m_bvhAabbMax.getY());
+	b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ());
+
+	///A full refit would go through updateBvhNodes(meshInterface) for every node;
+	///here only the subtrees whose quantized box overlaps the query box are recomputed.
+
+	unsigned short	regionMin[3];
+	unsigned short	regionMax[3];
+	quantize(&regionMin[0],aabbMin,0);
+	quantize(&regionMax[0],aabbMax,1);
+
+	for (int headerIdx=0;headerIdx<this->m_SubtreeHeaders.size();++headerIdx)
+	{
+		b3BvhSubtreeInfo& header = m_SubtreeHeaders[headerIdx];
+
+		//PCK: unsigned instead of bool
+		const unsigned touches = b3TestQuantizedAabbAgainstQuantizedAabb(regionMin,regionMax,header.m_quantizedAabbMin,header.m_quantizedAabbMax);
+		if (touches == 0)
+		{
+			continue;
+		}
+
+		// Recompute this subtree's nodes, then refresh its header from the subtree root.
+		updateBvhNodes(meshInterface,header.m_rootNodeIndex,header.m_rootNodeIndex+header.m_subtreeSize,headerIdx);
+		header.setAabbFromQuantizeNode(m_quantizedContiguousNodes[header.m_rootNodeIndex]);
+	}
+}
+
+void	b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index)	// Recomputes the quantized boxes of nodes [firstNode, endNode) from the current mesh data.
+{
+	(void)index;	// unused parameter
+
+	b3Assert(m_useQuantization);
+
+	int curNodeSubPart=-1;	// mesh part currently locked for reading, -1 = none
+
+	//get access info to trianglemesh data
+		const unsigned char *vertexbase = 0;
+		int numverts = 0;
+		PHY_ScalarType type = PHY_INTEGER;
+		int stride = 0;
+		const unsigned char *indexbase = 0;
+		int indexstride = 0;
+		int numfaces = 0;
+		PHY_ScalarType indicestype = PHY_INTEGER;
+
+		b3Vector3	triangleVerts[3];
+		b3Vector3	aabbMin,aabbMax;
+		const b3Vector3& meshScaling = meshInterface->getScaling();
+		
+		int i;
+		for (i=endNode-1;i>=firstNode;i--)	// walk backwards: children sit at higher indices than their parent, so they are refreshed first
+		{
+
+
+			b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
+			if (curNode.isLeafNode())
+			{
+				//recalc aabb from triangle data
+				int nodeSubPart = curNode.getPartId();
+				int nodeTriangleIndex = curNode.getTriangleIndex();
+				if (nodeSubPart != curNodeSubPart)
+				{
+					if (curNodeSubPart >= 0)	// release the previously locked part before locking the next one
+						meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+					meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,	type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart);
+
+					curNodeSubPart = nodeSubPart;
+					b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT);
+				}
+				//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
+
+				unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride);	// start of this triangle's three indices
+				
+				
+				for (int j=2;j>=0;j--)
+				{
+					
+					int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j];	// indices are 16-bit or 32-bit
+					if (type == PHY_FLOAT)
+					{
+						float* graphicsbase = (float*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = b3MakeVector3(
+							graphicsbase[0]*meshScaling.getX(),
+							graphicsbase[1]*meshScaling.getY(),
+							graphicsbase[2]*meshScaling.getZ());
+					}
+					else	// any other vertex type is read as double
+					{
+						double* graphicsbase = (double*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ()));
+					}
+				}
+
+
+				// Bounding box of the three scaled vertices; no minimum-thickness padding is applied here.
+				aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
+				aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); 
+				aabbMin.setMin(triangleVerts[0]);
+				aabbMax.setMax(triangleVerts[0]);
+				aabbMin.setMin(triangleVerts[1]);
+				aabbMax.setMax(triangleVerts[1]);
+				aabbMin.setMin(triangleVerts[2]);
+				aabbMax.setMax(triangleVerts[2]);
+
+				quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0);
+				quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1);
+				
+			} else
+			{
+				//combine aabb from both children
+
+				b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1];	// left child directly follows its parent
+				
+				b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] :	// right child follows the left child's whole subtree
+					&m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()];
+				
+
+				{
+					for (int axis=0;axis<3;axis++)	// per-axis union of the two child boxes ('axis' avoids shadowing the node index i)
+					{
+						curNode.m_quantizedAabbMin[axis] = leftChildNode->m_quantizedAabbMin[axis];
+						if (curNode.m_quantizedAabbMin[axis]>rightChildNode->m_quantizedAabbMin[axis])
+							curNode.m_quantizedAabbMin[axis]=rightChildNode->m_quantizedAabbMin[axis];
+
+						curNode.m_quantizedAabbMax[axis] = leftChildNode->m_quantizedAabbMax[axis];
+						if (curNode.m_quantizedAabbMax[axis] < rightChildNode->m_quantizedAabbMax[axis])
+							curNode.m_quantizedAabbMax[axis] = rightChildNode->m_quantizedAabbMax[axis];
+					}
+				}
+			}
+
+		}
+
+		if (curNodeSubPart >= 0)	// release the last locked part, if any
+			meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+
+		
+}
+
+///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
+{
+	// The base class does the actual in-place deserialization.
+	// This class adds no additional data, so the base pointer it returns
+	// is simply converted to the derived type with a static downcast.
+	return static_cast<b3OptimizedBvh*>(b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian));
+}
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
new file mode 100644
index 0000000000..0272ef83bf
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
@@ -0,0 +1,65 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///Contains contributions from Disney Studio's
+
+#ifndef B3_OPTIMIZED_BVH_H
+#define B3_OPTIMIZED_BVH_H
+
+#include "b3QuantizedBvh.h"
+
+class b3StridingMeshInterface;
+
+
+///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface.
+B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh
+{
+	///No data members are declared here; all state lives in the b3QuantizedBvh base class.
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+protected:
+	//(no protected members are declared)
+public:
+
+	b3OptimizedBvh();
+
+	virtual ~b3OptimizedBvh();
+	///Builds the tree from the mesh interface. NOTE(review): defined outside this header; confirm there how bvhAabbMin/bvhAabbMax and useQuantizedAabbCompression are used.
+	void	build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
+	///NOTE(review): defined outside this header; presumably recomputes the node AABBs of the whole tree from the mesh -- confirm.
+	void	refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax);
+	///NOTE(review): defined outside this header; presumably recomputes only the part of the tree touched by the given AABB -- confirm.
+	void	refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax);
+	///NOTE(review): defined outside this header; presumably recomputes the quantized AABBs of nodes between firstNode and endNode -- confirm the range convention and the meaning of 'index'.
+	void	updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
+
+	/// Serializes this BVH into the given buffer by forwarding to b3QuantizedBvh::serialize. Data buffer MUST be 16 byte aligned
+	virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
+	{
+		return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
+
+	}
+
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+
+
+};
+
+
+#endif //B3_OPTIMIZED_BVH_H
+
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
new file mode 100644
index 0000000000..52027e1118
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
@@ -0,0 +1,1301 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3QuantizedBvh.h"
+
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+
+#define RAYAABB2
+
+b3QuantizedBvh::b3QuantizedBvh()
+	: m_bulletVersion(B3_BULLET_VERSION),
+	  m_useQuantization(false),
+	  //other traversal modes that could be selected here: TRAVERSAL_STACKLESS, TRAVERSAL_RECURSIVE
+	  m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY),
+	  m_subtreeHeaderCount(0) //PCK: add this line
+{
+	//the tree bounds start out as the unbounded box [-inf,+inf] on every axis
+	m_bvhAabbMax.setValue(B3_INFINITY,B3_INFINITY,B3_INFINITY);
+	m_bvhAabbMin.setValue(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY);
+}
+
+
+
+
+
+void b3QuantizedBvh::buildInternal()
+{
+	///Builds the contiguous quantized node array; assumes that caller filled in the m_quantizedLeafNodes
+	m_useQuantization = true;
+	//now we have an array of leafnodes in m_quantizedLeafNodes
+	const int numLeafNodes = m_quantizedLeafNodes.size();
+
+	//two node slots per leaf: room for the leaves and the internal nodes buildTree creates
+	m_quantizedContiguousNodes.resize(2*numLeafNodes);
+	m_curNodeIndex = 0;
+
+	//An empty leaf array must not reach buildTree: when b3Assert is compiled out, buildTree(0,0)
+	//writes node 0 of a zero-sized array and then recurses on the same empty range without end.
+	//In that case the tree is simply left empty (no nodes, no subtree headers).
+	if (numLeafNodes > 0)
+	{
+		buildTree(0,numLeafNodes);
+
+		///if the entire tree is smaller than the subtree size limit, no header exists yet, so create one for the whole tree
+		if(!m_SubtreeHeaders.size())
+		{
+			b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+			subtree.m_rootNodeIndex = 0;
+			subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+		}
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+}
+
+
+
+///Debug-only palette, compiled only when DEBUG_PATCH_COLORS is defined: four colors used to visualize the individual patches/subtrees
+#ifdef DEBUG_PATCH_COLORS
+b3Vector3 color[4]=
+{
+	b3Vector3(1,0,0),
+	b3Vector3(0,1,0),
+	b3Vector3(0,0,1),
+	b3Vector3(0,1,1)
+};
+#endif //DEBUG_PATCH_COLORS
+
+
+
+void	b3QuantizedBvh::setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin)
+{
+	//pad the AABB by the margin on every axis; this is what keeps the extent used as a divisor below away from zero
+	const b3Vector3 margin = b3MakeVector3(quantizationMargin,quantizationMargin,quantizationMargin);
+	m_bvhAabbMax = bvhAabbMax + margin;
+	m_bvhAabbMin = bvhAabbMin - margin;
+	//per-axis scale: 65533 quantization steps spread over the padded extent
+	m_bvhQuantization = b3MakeVector3(b3Scalar(65533.0),b3Scalar(65533.0),b3Scalar(65533.0)) / (m_bvhAabbMax - m_bvhAabbMin);
+	m_useQuantization = true;
+}
+
+
+
+
+b3QuantizedBvh::~b3QuantizedBvh()
+{	//intentionally empty: no explicit cleanup is performed in this body
+}
+
+#ifdef DEBUG_TREE_BUILDING
+int gStackDepth = 0;	//current recursion depth of buildTree (incremented on entry, decremented before it returns)
+int gMaxStackDepth = 0;	//deepest value gStackDepth has reached so far
+#endif //DEBUG_TREE_BUILDING
+
+void	b3QuantizedBvh::buildTree	(int startIndex,int endIndex)
+{
+#ifdef DEBUG_TREE_BUILDING
+	gStackDepth++;
+	if (gStackDepth > gMaxStackDepth)
+		gMaxStackDepth = gStackDepth;
+#endif //DEBUG_TREE_BUILDING
+	//Recursively lays out the subtree over leaf nodes [startIndex,endIndex) into the contiguous node array,
+	//starting at m_curNodeIndex: each internal node is followed by its whole left subtree, then its right subtree.
+	int splitAxis, splitIndex, i;
+	int numIndices =endIndex-startIndex;
+	int curIndex = m_curNodeIndex;
+
+	b3Assert(numIndices>0);
+	//a single leaf ends the recursion: it is copied into the next free node slot
+	if (numIndices==1)
+	{
+#ifdef DEBUG_TREE_BUILDING
+		gStackDepth--;
+#endif //DEBUG_TREE_BUILDING
+		
+		assignInternalNodeFromLeafNode(m_curNodeIndex,startIndex);
+
+		m_curNodeIndex++;
+		return;	
+	}
+	//calculate Best Splitting Axis and where to split it. Sort the incoming 'leafNodes' array within range 'startIndex/endIndex'.
+	
+	splitAxis = calcSplittingAxis(startIndex,endIndex);
+
+	splitIndex = sortAndCalcSplittingIndex(startIndex,endIndex,splitAxis);
+	//remember this node's slot: m_curNodeIndex moves on while the children are built
+	int internalNodeIndex = m_curNodeIndex;
+	
+	//set the min aabb to 'inf' or a max value, and set the max aabb to a -inf/minimum value.
+	//the aabb will be expanded during buildTree/mergeInternalNodeAabb with actual node values
+	setInternalNodeAabbMin(m_curNodeIndex,m_bvhAabbMax);//can't use b3Vector3(B3_INFINITY,B3_INFINITY,B3_INFINITY)) because of quantization
+	setInternalNodeAabbMax(m_curNodeIndex,m_bvhAabbMin);//can't use b3Vector3(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY)) because of quantization
+	
+	//grow this node's AABB over the AABB of every leaf in the range
+	for (i=startIndex;i<endIndex;i++)
+	{
+		mergeInternalNodeAabb(m_curNodeIndex,getAabbMin(i),getAabbMax(i));
+	}
+
+	m_curNodeIndex++;
+	
+
+	//the escape index of this node is only known after both children are built (set at the end)
+	
+	int leftChildNodexIndex = m_curNodeIndex;
+
+	//build left child tree
+	buildTree(startIndex,splitIndex);
+
+	int rightChildNodexIndex = m_curNodeIndex;
+	//build right child tree
+	buildTree(splitIndex,endIndex);
+
+#ifdef DEBUG_TREE_BUILDING
+	gStackDepth--;
+#endif //DEBUG_TREE_BUILDING
+	//number of node slots consumed by this subtree, this node included
+	int escapeIndex = m_curNodeIndex - curIndex;
+
+	if (m_useQuantization)
+	{
+		//escapeIndex is the number of nodes of this subtree
+		const int sizeQuantizedNode =sizeof(b3QuantizedBvhNode);
+		const int treeSizeInBytes = escapeIndex * sizeQuantizedNode;
+		if (treeSizeInBytes > MAX_SUBTREE_SIZE_IN_BYTES)
+		{
+			updateSubtreeHeaders(leftChildNodexIndex,rightChildNodexIndex);
+		}
+	} else
+	{
+		//no subtree headers are maintained without quantization
+	}
+	//stored on the internal node so the stackless walks can jump past this whole subtree
+	setInternalNodeEscapeIndex(internalNodeIndex,escapeIndex);
+
+}
+
+void	b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex)
+{
+	b3Assert(m_useQuantization);
+
+	//byte size of a single quantized node, used to express subtree sizes in bytes
+	const int nodeBytes = static_cast<int>(sizeof(b3QuantizedBvhNode));
+	//left child first (header order is preserved): it gets a header when its subtree fits the size limit
+	b3QuantizedBvhNode& lhs = m_quantizedContiguousNodes[leftChildNodexIndex];
+	const int lhsNodeCount = lhs.isLeafNode() ? 1 : lhs.getEscapeIndex();
+	if (lhsNodeCount * nodeBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		b3BvhSubtreeInfo& header = m_SubtreeHeaders.expand();
+		header.setAabbFromQuantizeNode(lhs);
+		header.m_rootNodeIndex = leftChildNodexIndex;
+		header.m_subtreeSize = lhsNodeCount;
+	}
+
+	//then the right child, under the same rule
+	b3QuantizedBvhNode& rhs = m_quantizedContiguousNodes[rightChildNodexIndex];
+	const int rhsNodeCount = rhs.isLeafNode() ? 1 : rhs.getEscapeIndex();
+	if (rhsNodeCount * nodeBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		b3BvhSubtreeInfo& header = m_SubtreeHeaders.expand();
+		header.setAabbFromQuantizeNode(rhs);
+		header.m_rootNodeIndex = rightChildNodexIndex;
+		header.m_subtreeSize = rhsNodeCount;
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+}
+
+
+int	b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis)
+{
+	int i;
+	int splitIndex =startIndex;
+	int numIndices = endIndex - startIndex;
+	b3Scalar splitValue;
+	//Partitions the leaves in [startIndex,endIndex) around the mean AABB center on splitAxis and returns the index where the second part starts.
+	b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		means+=center;
+	}
+	means *= (b3Scalar(1.)/(b3Scalar)numIndices);
+	
+	splitValue = means[splitAxis];
+	
+	//sort leafNodes so all values larger than splitValue come first, and smaller values start from 'splitIndex'.
+	for (i=startIndex;i<endIndex;i++)
+	{
+		b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		if (center[splitAxis] > splitValue)
+		{
+			//swap
+			swapLeafNodes(i,splitIndex);
+			splitIndex++;
+		}
+	}
+
+	//if the splitIndex causes unbalanced trees, fix this by using the center in between startIndex and endIndex
+	//otherwise the tree-building might fail due to stack-overflows in certain cases.
+	//unbalanced1 is unsafe: it can cause stack overflows
+	//bool unbalanced1 = ((splitIndex==startIndex) || (splitIndex == (endIndex-1)));
+
+	//unbalanced2 should work too: always use center (perfect balanced trees)	
+	//bool unbalanced2 = true;
+
+	//this should be safe too:
+	int rangeBalancedIndices = numIndices/3;
+	bool unbalanced = ((splitIndex<=(startIndex+rangeBalancedIndices)) || (splitIndex >=(endIndex-1-rangeBalancedIndices)));
+	
+	if (unbalanced)
+	{
+		splitIndex = startIndex+ (numIndices>>1);
+	}
+	//both sides of the split must be non-empty, otherwise buildTree would recurse on an empty range
+	bool unbal = (splitIndex==startIndex) || (splitIndex == (endIndex));
+	(void)unbal;
+	b3Assert(!unbal);
+
+	return splitIndex;
+}
+
+
+int	b3QuantizedBvh::calcSplittingAxis(int startIndex,int endIndex)
+{
+	//Picks the axis along which the AABB centers of the leaves in [startIndex,endIndex) vary the most.
+	const int count = endIndex-startIndex;
+
+	//first pass: mean of the AABB centers
+	b3Vector3 centerMean=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	for (int leaf=startIndex;leaf<endIndex;leaf++)
+	{
+		centerMean += b3Scalar(0.5)*(getAabbMax(leaf)+getAabbMin(leaf));
+	}
+	centerMean *= (b3Scalar(1.)/(b3Scalar)count);
+
+	//second pass: per-axis sum of squared deviations from that mean
+	b3Vector3 spread=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	for (int leaf=startIndex;leaf<endIndex;leaf++)
+	{
+		b3Vector3 delta = b3Scalar(0.5)*(getAabbMax(leaf)+getAabbMin(leaf)) - centerMean;
+		spread += delta * delta;
+	}
+
+	//divide by (count-1); buildTree only calls this with at least two leaves
+	spread *= (b3Scalar(1.)/	((b3Scalar)count-1)	);
+
+	return spread.maxAxis();
+}
+
+
+
+void	b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
+{
+	//Walks the tree against the query box [aabbMin,aabbMax], handing overlapping leaves to nodeCallback.
+	//With quantization the walk is selected by m_traversalMode; without it the stackless walk is always used.
+	if (m_useQuantization)
+	{
+		///quantize query AABB
+		unsigned short int quantizedQueryAabbMin[3];
+		unsigned short int quantizedQueryAabbMax[3];
+		quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0);
+		quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1);
+
+		switch (m_traversalMode)
+		{
+		case TRAVERSAL_STACKLESS:
+				walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,0,m_curNodeIndex);
+			break;
+		case TRAVERSAL_STACKLESS_CACHE_FRIENDLY:
+				walkStacklessQuantizedTreeCacheFriendly(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			break;
+		case TRAVERSAL_RECURSIVE:
+			{
+				const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[0];
+				walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			}
+			break;
+		default:
+			//unsupported
+			b3Assert(0);
+		}
+	} else
+	{
+		walkStacklessTree(nodeCallback,aabbMin,aabbMax);
+	}
+}
+
+
+static int b3s_maxIterations = 0;	//largest walkIterations count reached by any stackless walk so far; NOTE(review): written from const traversal methods with no synchronization visible here
+
+
+void	b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
+{
+	b3Assert(!m_useQuantization);
+	//Stackless AABB query over the unquantized node array; 'rootNode' is the moving cursor, not only the root.
+	const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0];
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+	//nodes are visited in array order; a non-overlapping internal node is skipped together with its subtree
+	while (curIndex < m_curNodeIndex)
+	{
+		//catch bugs in tree data
+		b3Assert (walkIterations < m_curNodeIndex);
+
+		walkIterations++;
+		aabbOverlap = b3TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		isLeafNode = rootNode->m_escapeIndex == -1;
+		//an escape index of -1 marks a leaf
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (aabbOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		//overlap or leaf: step to the next node in the array; otherwise jump by the escape index
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex;
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+
+}
+
+/*
+///this was the original recursive traversal, before we optimized towards stackless traversal
+void	b3QuantizedBvh::walkTree(b3OptimizedBvhNode* rootNode,b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
+{
+	bool isLeafNode, aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMin,rootNode->m_aabbMax);
+	if (aabbOverlap)
+	{
+		isLeafNode = (!rootNode->m_leftChild && !rootNode->m_rightChild);
+		if (isLeafNode)
+		{
+			nodeCallback->processNode(rootNode);
+		} else
+		{
+			walkTree(rootNode->m_leftChild,nodeCallback,aabbMin,aabbMax);
+			walkTree(rootNode->m_rightChild,nodeCallback,aabbMin,aabbMax);
+		}
+	}
+
+}
+*/
+
+void b3QuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const
+{
+	b3Assert(m_useQuantization);
+
+	//Recursive query: hands every leaf whose quantized AABB overlaps the quantized query box to nodeCallback.
+	//PCK: unsigned instead of bool
+	const unsigned overlaps = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax);
+	const bool atLeaf = currentNode->isLeafNode();
+
+	//no overlap with this node: nothing beneath it is visited
+	if (overlaps == 0)
+	{
+		return;
+	}
+
+	if (atLeaf)
+	{
+		nodeCallback->processNode(currentNode->getPartId(),currentNode->getTriangleIndex());
+		return;
+	}
+
+	//the left child is stored directly after its parent
+	const b3QuantizedBvhNode* lhs = currentNode+1;
+	walkRecursiveQuantizedTreeAgainstQueryAabb(lhs,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+
+	//the right child follows the whole left subtree (a single node when the left child is a leaf)
+	const b3QuantizedBvhNode* rhs = lhs + (lhs->isLeafNode() ? 1 : lhs->getEscapeIndex());
+	walkRecursiveQuantizedTreeAgainstQueryAabb(rhs,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+}
+
+
+
+void	b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const
+{
+	b3Assert(!m_useQuantization);
+	//Stackless ray / box-cast query over the unquantized nodes. NOTE(review): startNodeIndex/endNodeIndex are never read; the walk always covers [0,m_curNodeIndex).
+	const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0];
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap=0;
+	unsigned rayBoxOverlap=0;
+	b3Scalar lambda_max = 1.0;
+	
+		/* Quick pruning by the AABB of the ray segment (this variant does not quantize it) */
+	b3Vector3 rayAabbMin = raySource;
+	b3Vector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+
+#ifdef RAYAABB2
+	b3Vector3 rayDir = (rayTarget-raySource);
+	rayDir.normalize ();
+	lambda_max = rayDir.dot(rayTarget-raySource);
+	///division by zero is avoided below: a direction component that is exactly zero gets B3_LARGE_FLOAT as its inverse. NOTE(review): raySource == rayTarget normalizes a zero-length vector -- confirm normalize() handles that
+	b3Vector3 rayDirectionInverse;
+	rayDirectionInverse[0] = rayDir[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[0];
+	rayDirectionInverse[1] = rayDir[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[1];
+	rayDirectionInverse[2] = rayDir[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[2];
+	unsigned int sign[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0};
+#endif
+
+	b3Vector3 bounds[2];
+	//nodes are visited in array order; a node the ray misses is skipped together with its subtree via its escape index
+	while (curIndex < m_curNodeIndex)
+	{
+		b3Scalar param = 1.0;
+		//catch bugs in tree data
+		b3Assert (walkIterations < m_curNodeIndex);
+
+		walkIterations++;
+
+		bounds[0] = rootNode->m_aabbMinOrg;
+		bounds[1] = rootNode->m_aabbMaxOrg;
+		/* Add box cast extents */
+		bounds[0] -= aabbMax;
+		bounds[1] -= aabbMin;
+
+		aabbOverlap = b3TestAabbAgainstAabb2(rayAabbMin,rayAabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		//perhaps profile if it is worth doing the aabbOverlap test first
+
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+		rayBoxOverlap = aabbOverlap ? b3RayAabb2 (raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false;
+
+#else
+		b3Vector3 normal;
+		rayBoxOverlap = b3RayAabb(raySource, rayTarget,bounds[0],bounds[1],param, normal);
+#endif
+
+		isLeafNode = rootNode->m_escapeIndex == -1;
+		//an escape index of -1 marks a leaf
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (rayBoxOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		//hit or leaf: step to the next node in the array; otherwise jump by the escape index
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex;
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+
+}
+
+
+
+void	b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const
+{
+	b3Assert(m_useQuantization);
+	//Stackless ray / box-cast query over the quantized nodes in [startNodeIndex,endNodeIndex).
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+	(void)subTreeSize;
+	//'rootNode' is the moving cursor of the walk, not only the root
+	const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex];
+	int escapeIndex;
+	
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned boxBoxOverlap = 0;
+	unsigned rayBoxOverlap = 0;
+
+	b3Scalar lambda_max = 1.0;
+
+#ifdef RAYAABB2
+	b3Vector3 rayDirection = (rayTarget-raySource);
+	rayDirection.normalize ();
+	lambda_max = rayDirection.dot(rayTarget-raySource);
+	///each component is replaced in place by its inverse (B3_LARGE_FLOAT when it is exactly zero), so rayDirection holds the inverse direction from here on
+	rayDirection[0] = rayDirection[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[0];
+	rayDirection[1] = rayDirection[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[1];
+	rayDirection[2] = rayDirection[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[2];
+	unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0};
+#endif
+
+	/* Quick pruning by quantized box */
+	b3Vector3 rayAabbMin = raySource;
+	b3Vector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	quantizeWithClamp(quantizedQueryAabbMin,rayAabbMin,0);
+	quantizeWithClamp(quantizedQueryAabbMax,rayAabbMax,1);
+
+	while (curIndex < endNodeIndex)
+	{
+
+//#define VISUALLY_ANALYZE_BVH 1
+#ifdef VISUALLY_ANALYZE_BVH
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0;
+		//need some global access to a debugDrawer
+		extern b3IDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			b3Vector3 aabbMin,aabbMax;
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			b3Vector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+#endif//VISUALLY_ANALYZE_BVH
+
+		//catch bugs in tree data
+		b3Assert (walkIterations < subTreeSize);
+
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		// only interested if this is closer than any previous hit
+		b3Scalar param = 1.0;
+		rayBoxOverlap = 0;
+		boxBoxOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		if (boxBoxOverlap)
+		{
+			b3Vector3 bounds[2];
+			bounds[0] = unQuantize(rootNode->m_quantizedAabbMin);
+			bounds[1] = unQuantize(rootNode->m_quantizedAabbMax);
+			/* Add box cast extents */
+			bounds[0] -= aabbMax;
+			bounds[1] -= aabbMin;
+#if 0
+			b3Vector3 normal;
+			bool ra2 = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0, lambda_max);
+			bool ra = b3RayAabb (raySource, rayTarget, bounds[0], bounds[1], param, normal);
+			if (ra2 != ra)
+			{
+				printf("functions don't match\n");
+			}
+#endif
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+
+			//B3_PROFILE("b3RayAabb2");
+			rayBoxOverlap = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max);
+			
+#else
+			rayBoxOverlap = true;//b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal);
+#endif
+		}
+		//rayBoxOverlap is still 0 here when the cheap quantized box test already failed
+		if (isLeafNode && rayBoxOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		}
+		//hit or leaf: step to the next node in the array; otherwise jump by the escape index
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+
+}
+
+void	b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const
+{
+	b3Assert(m_useQuantization);
+	//Stackless AABB query over the quantized nodes in [startNodeIndex,endNodeIndex); the query box is already quantized.
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+	(void)subTreeSize;
+	//'rootNode' is the moving cursor of the walk, not only the root
+	const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex];
+	int escapeIndex;
+	
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+
+	while (curIndex < endNodeIndex)
+	{
+
+//#define VISUALLY_ANALYZE_BVH 1
+#ifdef VISUALLY_ANALYZE_BVH
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0;
+		//need some global access to a debugDrawer
+		extern b3IDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			b3Vector3 aabbMin,aabbMax;
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			b3Vector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+#endif//VISUALLY_ANALYZE_BVH
+
+		//catch bugs in tree data
+		b3Assert (walkIterations < subTreeSize);
+
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		//only overlapping leaves are reported to the callback
+		if (isLeafNode && aabbOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		} 
+		//overlap or leaf: step to the next node in the array; otherwise jump by the escape index
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (b3s_maxIterations < walkIterations)
+		b3s_maxIterations = walkIterations;
+
+}
+
+//This traversal can be called from Playstation 3 SPU
+void	b3QuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const
+{
+	b3Assert(m_useQuantization);
+
+	//Visit every subtree header; a subtree is only walked when its quantized AABB
+	//overlaps the quantized query box.
+	for (int headerIndex=0;headerIndex<this->m_SubtreeHeaders.size();headerIndex++)
+	{
+		const b3BvhSubtreeInfo& header = m_SubtreeHeaders[headerIndex];
+
+		//PCK: unsigned instead of bool
+		const unsigned hit = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,header.m_quantizedAabbMin,header.m_quantizedAabbMax);
+		if (hit == 0)
+		{
+			continue;
+		}
+		//node range of this subtree: [m_rootNodeIndex, m_rootNodeIndex+m_subtreeSize)
+		walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
+			header.m_rootNodeIndex,header.m_rootNodeIndex+header.m_subtreeSize);
+	}
+}
+
+
+void	b3QuantizedBvh::reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const
+{	//a plain ray query is issued as a box cast whose box extents are both (0,0,0)
+	reportBoxCastOverlappingNodex(nodeCallback,raySource,rayTarget,b3MakeVector3(0,0,0),b3MakeVector3(0,0,0));
+}
+
+
+void	b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
+{
+	//always use stackless
+	//both walks are given the node range [0,m_curNodeIndex); only the node representation differs
+	if (!m_useQuantization)
+	{
+		walkStacklessTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex);
+	}
+	else
+	{
+		walkStacklessQuantizedTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex);
+	}
+	/*
+	{
+		//recursive traversal
+		b3Vector3 qaabbMin = raySource;
+		b3Vector3 qaabbMax = raySource;
+		qaabbMin.setMin(rayTarget);
+		qaabbMax.setMax(rayTarget);
+		qaabbMin += aabbMin;
+		qaabbMax += aabbMax;
+		reportAabbOverlappingNodex(nodeCallback,qaabbMin,qaabbMax);
+	}
+	*/
+
+}
+
+
+void	b3QuantizedBvh::swapLeafNodes(int i,int splitIndex)
+{
+	if (!m_useQuantization)
+	{
+		//unquantized leaves: exchange the two entries through a temporary copy
+		const b3OptimizedBvhNode held = m_leafNodes[splitIndex];
+		m_leafNodes[splitIndex] = m_leafNodes[i];
+		m_leafNodes[i] = held;
+		return;
+	}
+	const b3QuantizedBvhNode held = m_quantizedLeafNodes[splitIndex];
+	m_quantizedLeafNodes[splitIndex] = m_quantizedLeafNodes[i];
+	m_quantizedLeafNodes[i] = held;
+}
+
+void	b3QuantizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex)
+{
+	//copy the leaf into slot 'internalNode' of the contiguous array that matches the active representation
+	if (!m_useQuantization)
+	{
+		m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex];
+		return;
+	}
+	m_quantizedContiguousNodes[internalNode] = m_quantizedLeafNodes[leafNodeIndex];
+}
+
+//PCK: include
+#include <new>
+
+#if 0
+//PCK: consts
+static const unsigned BVH_ALIGNMENT = 16;
+static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1;
+
+static const unsigned BVH_ALIGNMENT_BLOCKS = 2;
+#endif
+
+
+unsigned int b3QuantizedBvh::getAlignmentSerializationPadding()
+{
+	// Extra padding bytes added to the serialization buffer size: always 0, since the extra padding is not needed or used.
+	return 0;//BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT;
+}
+
+unsigned b3QuantizedBvh::calculateSerializeBufferSize() const
+{
+	//Sum of: the bvh object itself, the alignment padding, one b3BvhSubtreeInfo per subtree
+	//header, and one node per used slot (node type chosen by m_useQuantization).
+	unsigned total = sizeof(b3QuantizedBvh) + getAlignmentSerializationPadding();
+	total += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount;
+	//m_curNodeIndex is the number of nodes laid out in the contiguous array
+	total += m_curNodeIndex * (m_useQuantization ? sizeof(b3QuantizedBvhNode) : sizeof(b3OptimizedBvhNode));
+	return total;
+}
+
+bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const
+{
+	b3Assert(m_subtreeHeaderCount == m_SubtreeHeaders.size());
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+/*	if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		///check alignedment for buffer?
+		b3Assert(0);
+		return false;
+	}
+*/
+
+	b3QuantizedBvh *targetBvh = (b3QuantizedBvh *)o_alignedDataBuffer;
+
+	// construct the class so the virtual function table, etc will be set up
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (targetBvh) b3QuantizedBvh;
+
+	if (i_swapEndian)
+	{
+		targetBvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(m_curNodeIndex));
+
+
+		b3SwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin);
+		b3SwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax);
+		b3SwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization);
+
+		targetBvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(m_traversalMode);
+		targetBvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(m_subtreeHeaderCount));
+	}
+	else
+	{
+		targetBvh->m_curNodeIndex = m_curNodeIndex;
+		targetBvh->m_bvhAabbMin = m_bvhAabbMin;
+		targetBvh->m_bvhAabbMax = m_bvhAabbMax;
+		targetBvh->m_bvhQuantization = m_bvhQuantization;
+		targetBvh->m_traversalMode = m_traversalMode;
+		targetBvh->m_subtreeHeaderCount = m_subtreeHeaderCount;
+	}
+
+	targetBvh->m_useQuantization = m_useQuantization;
+
+	unsigned char *nodeData = (unsigned char *)targetBvh;
+	nodeData += sizeof(b3QuantizedBvh);
+	
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	
+	int nodeCount = m_curNodeIndex;
+
+	if (m_useQuantization)
+	{
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+	
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex;
+
+
+			}
+		}
+		nodeData += sizeof(b3QuantizedBvhNode) * nodeCount;
+
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+	else
+	{
+		targetBvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMinOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMaxOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_escapeIndex));
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_subPart));
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg = m_contiguousNodes[nodeIndex].m_aabbMinOrg;
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg = m_contiguousNodes[nodeIndex].m_aabbMaxOrg;
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = m_contiguousNodes[nodeIndex].m_escapeIndex;
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = m_contiguousNodes[nodeIndex].m_subPart;
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = m_contiguousNodes[nodeIndex].m_triangleIndex;
+			}
+		}
+		nodeData += sizeof(b3OptimizedBvhNode) * nodeCount;
+
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_contiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now serialize the subtree headers
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, m_subtreeHeaderCount, m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_rootNodeIndex));
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+	else
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = (m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = (m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = (m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = (m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = (m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = (m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = (m_SubtreeHeaders[i].m_rootNodeIndex);
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = (m_SubtreeHeaders[i].m_subtreeSize);
+
+			// need to clear padding in destination buffer
+			targetBvh->m_SubtreeHeaders[i].m_padding[0] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[1] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[2] = 0;
+		}
+	}
+	nodeData += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount;
+
+	// this clears the pointer in the member variable it doesn't really do anything to the data
+	// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+	// so the memory (which is not freed) is left alone
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(NULL, 0, 0);
+
+	// this wipes the virtual function table pointer at the start of the buffer for the class
+	*((void**)o_alignedDataBuffer) = NULL;
+
+	return true;
+}
+
+b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
+{
+	// Treats i_alignedDataBuffer as a serialized b3QuantizedBvh and fixes it up in place; returns NULL when the buffer is NULL or smaller than calculateSerializeBufferSize() reports.
+	if (i_alignedDataBuffer == NULL)// || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		return NULL;
+	}
+	b3QuantizedBvh *bvh = (b3QuantizedBvh *)i_alignedDataBuffer;
+	// Byte-swap the scalar header fields first: m_curNodeIndex and m_subtreeHeaderCount are used as element counts below.
+	if (i_swapEndian)
+	{
+		bvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_curNodeIndex));
+
+		b3UnSwapVector3Endian(bvh->m_bvhAabbMin);
+		b3UnSwapVector3Endian(bvh->m_bvhAabbMax);
+		b3UnSwapVector3Endian(bvh->m_bvhQuantization);
+
+		bvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(bvh->m_traversalMode);
+		bvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(bvh->m_subtreeHeaderCount));
+	}
+	// Size check: b3Assert first, then a hard failure (NULL) when the caller's buffer is smaller than the calculated size.
+	unsigned int calculatedBufSize = bvh->calculateSerializeBufferSize();
+	b3Assert(calculatedBufSize <= i_dataBufferSize);
+
+	if (calculatedBufSize > i_dataBufferSize)
+	{
+		return NULL;
+	}
+
+	unsigned char *nodeData = (unsigned char *)bvh;
+	nodeData += sizeof(b3QuantizedBvh);
+	
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	
+	int nodeCount = bvh->m_curNodeIndex;
+
+	// Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (bvh) b3QuantizedBvh(*bvh, false);
+
+	if (bvh->m_useQuantization)
+	{
+		bvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		nodeData += sizeof(b3QuantizedBvhNode) * nodeCount;
+	}
+	else
+	{
+		bvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+				
+				bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex));
+				bvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart));
+				bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		nodeData += sizeof(b3OptimizedBvhNode) * nodeCount;
+	}
+	// nodeData now points just past the node array; the subtree headers are read from there.
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now deserialize the subtree headers
+	bvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, bvh->m_subtreeHeaderCount, bvh->m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < bvh->m_subtreeHeaderCount; i++)
+		{
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			bvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_rootNodeIndex));
+			bvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+
+	return bvh;
+}
+
+// Special constructor (used with placement new by deSerializeInPlace) that prevents b3Vector3's default constructor from being called: the three bounds/quantization vectors are copied from 'self' and m_bulletVersion is stamped; the bool argument is unused.
+b3QuantizedBvh::b3QuantizedBvh(b3QuantizedBvh &self, bool /* ownsMemory */) :
+m_bvhAabbMin(self.m_bvhAabbMin),
+m_bvhAabbMax(self.m_bvhAabbMax),
+m_bvhQuantization(self.m_bvhQuantization),
+m_bulletVersion(B3_BULLET_VERSION)
+{
+	// Intentionally empty: no other member is explicitly initialized here.
+}
+// Rebuilds this BVH from a b3QuantizedBvhFloatData struct: copies the bounds/quantization vectors, node count, quantization flag and traversal mode, then resizes and refills the node and subtree-header arrays from the struct's pointers.
+void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData)
+{
+	m_bvhAabbMax.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeFloat(quantizedBvhFloatData.m_bvhQuantization);
+
+	m_curNodeIndex = quantizedBvhFloatData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhFloatData.m_useQuantization!=0;
+	// Unquantized nodes (m_contiguousNodes); the source pointer is only read when the count is non-zero.
+	{
+		int numElem = quantizedBvhFloatData.m_numContiguousLeafNodes;
+		m_contiguousNodes.resize(numElem);
+
+		if (numElem)
+		{
+			b3OptimizedBvhNodeFloatData* memPtr = quantizedBvhFloatData.m_contiguousNodesPtr;
+
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeFloat(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeFloat(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+	// Quantized nodes (m_quantizedContiguousNodes), copied field by field.
+	{
+		int numElem = quantizedBvhFloatData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		
+		if (numElem)
+		{
+			b3QuantizedBvhNodeData* memPtr = quantizedBvhFloatData.m_quantizedContiguousNodesPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+
+	m_traversalMode = b3TraversalMode(quantizedBvhFloatData.m_traversalMode);
+	// Subtree headers. NOTE(review): m_subtreeHeaderCount is not assigned anywhere in this function - confirm nothing relies on it after this call.
+	{
+		int numElem = quantizedBvhFloatData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			b3BvhSubtreeInfoData* memPtr = quantizedBvhFloatData.m_subTreeInfoPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+}
+// Rebuilds this BVH from a b3QuantizedBvhDoubleData struct: copies the bounds/quantization vectors, node count, quantization flag and traversal mode, then resizes and refills the node and subtree-header arrays from the struct's pointers.
+void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData)
+{
+	m_bvhAabbMax.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeDouble(quantizedBvhDoubleData.m_bvhQuantization);
+
+	m_curNodeIndex = quantizedBvhDoubleData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhDoubleData.m_useQuantization!=0;
+	// Unquantized nodes (m_contiguousNodes); the source pointer is only read when the count is non-zero.
+	{
+		int numElem = quantizedBvhDoubleData.m_numContiguousLeafNodes;
+		m_contiguousNodes.resize(numElem);
+
+		if (numElem)
+		{
+			b3OptimizedBvhNodeDoubleData* memPtr = quantizedBvhDoubleData.m_contiguousNodesPtr;
+
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeDouble(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeDouble(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+	// Quantized nodes (m_quantizedContiguousNodes), copied field by field.
+	{
+		int numElem = quantizedBvhDoubleData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		
+		if (numElem)
+		{
+			b3QuantizedBvhNodeData* memPtr = quantizedBvhDoubleData.m_quantizedContiguousNodesPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+
+	m_traversalMode = b3TraversalMode(quantizedBvhDoubleData.m_traversalMode);
+	// Subtree headers. NOTE(review): m_subtreeHeaderCount is not assigned anywhere in this function - confirm nothing relies on it after this call.
+	{
+		int numElem = quantizedBvhDoubleData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			b3BvhSubtreeInfoData* memPtr = quantizedBvhDoubleData.m_subTreeInfoPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+
+}
+
+
+
+///Intended to fill the dataBuffer and return the struct name (0 on failure); this implementation is a stub that trips b3Assert(0), ignores both arguments and always returns 0.
+const char*	b3QuantizedBvh::serialize(void* dataBuffer, b3Serializer* serializer) const
+{
+	b3Assert(0);
+	return 0;
+}
+
+
+
+
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
new file mode 100644
index 0000000000..63c523c758
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
@@ -0,0 +1,556 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_QUANTIZED_BVH_H
+#define B3_QUANTIZED_BVH_H
+
+class b3Serializer;
+
+//#define DEBUG_CHECK_DEQUANTIZATION 1
+#ifdef DEBUG_CHECK_DEQUANTIZATION
+#ifdef __SPU__
+#define printf spu_printf
+#endif //__SPU__
+
+#include <stdio.h>
+#include <stdlib.h>
+#endif //DEBUG_CHECK_DEQUANTIZATION
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedAllocator.h"
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define b3QuantizedBvhData b3QuantizedBvhDoubleData
+#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData
+#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData"
+#else
+#define b3QuantizedBvhData b3QuantizedBvhFloatData
+#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData
+#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData"
+#endif
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+
+
+
+//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
+
+
+//Note: currently we have 16 bytes per quantized node
+#define MAX_SUBTREE_SIZE_IN_BYTES  2048
+
+// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
+// actually) triangles each (since the sign bit is reserved).
+#define MAX_NUM_PARTS_IN_BITS 10
+
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///The same node serves as leaf or internal node. For a leaf, m_escapeIndexOrTriangleIndex is non-negative and packs the part id in the bits at and above (31-MAX_NUM_PARTS_IN_BITS) and the triangle index in the bits below.
+B3_ATTRIBUTE_ALIGNED16	(struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+	// Internal nodes store the negated escape index, so the sign of m_escapeIndexOrTriangleIndex tells the two node kinds apart.
+	bool isLeafNode() const
+	{
+		//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+		return (m_escapeIndexOrTriangleIndex >= 0);
+	}
+	int getEscapeIndex() const
+	{
+		b3Assert(!isLeafNode());
+		return -m_escapeIndexOrTriangleIndex;
+	}
+	int	getTriangleIndex() const
+	{
+		b3Assert(isLeafNode());
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);	// (x&0) is 0, so ~(x&0) is all ones: y masks every bit at or above (31-MAX_NUM_PARTS_IN_BITS)
+		// Clear the bits covered by y to get only the lower bits where the triangle index is stored
+		return (m_escapeIndexOrTriangleIndex&~(y));
+	}
+	int	getPartId() const
+	{
+		b3Assert(isLeafNode());
+		// Shift the triangle-index bits away to get only the highest bits where the part index is stored
+		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
+	}
+}
+;
+
+/// b3OptimizedBvhNode contains both internal and leaf node information.
+/// The data members add up to 44 bytes per the byte counts noted below; m_padding then brings the node to 64 bytes. You can use the compressed version of 16 bytes.
+B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	//32 bytes (two b3Vector3; assumes 16 bytes each - TODO confirm for double precision builds)
+	b3Vector3	m_aabbMinOrg;
+	b3Vector3	m_aabbMaxOrg;
+
+	//4
+	int	m_escapeIndex;
+
+	//8
+	//for child nodes
+	int	m_subPart;
+	int	m_triangleIndex;
+
+//pad the size to 64 bytes (32 + 4 + 8 + 20)
+	char	m_padding[20];
+};
+
+
+///b3BvhSubtreeInfo provides info to gather a subtree of limited size
+B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
+{
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+	// The constructor assigns nothing: the memset of m_padding below is commented out.
+	b3BvhSubtreeInfo()
+	{
+		//memset(&m_padding[0], 0, sizeof(m_padding));
+	}
+
+	// Copies the quantized AABB min/max (three components each) from quantizedNode; no other field is touched.
+	void	setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
+	{
+		m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0];
+		m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1];
+		m_quantizedAabbMin[2] = quantizedNode.m_quantizedAabbMin[2];
+		m_quantizedAabbMax[0] = quantizedNode.m_quantizedAabbMax[0];
+		m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1];
+		m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2];
+	}
+}
+;
+
+
+class b3NodeOverlapCallback
+{
+public:
+	virtual ~b3NodeOverlapCallback() {};
+	// Pure virtual hook taking a (subPart, triangleIndex) pair; the walk*/report* methods of b3QuantizedBvh accept a pointer to this interface.
+	virtual void processNode(int subPart, int triangleIndex) = 0;
+};
+
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+
+
+///for code readability:
+typedef b3AlignedObjectArray<b3OptimizedBvhNode>	NodeArray;
+typedef b3AlignedObjectArray<b3QuantizedBvhNode>	QuantizedNodeArray;
+typedef b3AlignedObjectArray<b3BvhSubtreeInfo>		BvhSubtreeInfoArray;
+
+
+///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
+///It is used by the b3BvhTriangleMeshShape as midphase
+///It is recommended to use quantization for better performance and lower memory requirements.
+B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh
+{
+public:
+	enum b3TraversalMode
+	{
+		TRAVERSAL_STACKLESS = 0,
+		TRAVERSAL_STACKLESS_CACHE_FRIENDLY,
+		TRAVERSAL_RECURSIVE
+	};
+
+
+
+
+	b3Vector3			m_bvhAabbMin;
+	b3Vector3			m_bvhAabbMax;
+	b3Vector3			m_bvhQuantization;
+
+protected:
+	int					m_bulletVersion;	//for serialization versioning. It could also be used to detect endianess.
+
+	int					m_curNodeIndex;
+	//quantization data
+	bool				m_useQuantization;
+
+
+
+	NodeArray			m_leafNodes;
+	NodeArray			m_contiguousNodes;
+	QuantizedNodeArray	m_quantizedLeafNodes;
+	QuantizedNodeArray	m_quantizedContiguousNodes;
+	
+	b3TraversalMode	m_traversalMode;
+	BvhSubtreeInfoArray		m_SubtreeHeaders;
+
+	//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
+	mutable int m_subtreeHeaderCount;
+
+	
+
+
+
+	///Sets the AABB minimum of node nodeIndex in whichever contiguous array is active: quantized (via quantize() with isMax=0) when m_useQuantization is set, plain b3Vector3 otherwise.
+	///Two storage paths, one for quantized and one for normal nodes; this allows code-reuse while maintaining readability (no template/macro!). It might be refactored into a virtual, it is usually not calculated at run-time
+	void	setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
+
+		}
+	}
+	void	setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax)	// sets the AABB maximum of node nodeIndex in the active contiguous array
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);	// isMax=1 selects the max-side rounding in quantize()
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
+		}
+	}
+
+	b3Vector3 getAabbMin(int nodeIndex) const	// reads from the *leaf* arrays (m_quantizedLeafNodes / m_leafNodes), not the contiguous ones
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);
+		}
+		//non-quantized: the leaf stores the minimum as a b3Vector3, returned as-is
+		return m_leafNodes[nodeIndex].m_aabbMinOrg;
+
+	}
+	b3Vector3 getAabbMax(int nodeIndex) const	// reads from the *leaf* arrays (m_quantizedLeafNodes / m_leafNodes), not the contiguous ones
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
+		} 
+		//non-quantized: the leaf stores the maximum as a b3Vector3, returned as-is
+		return m_leafNodes[nodeIndex].m_aabbMaxOrg;
+		
+	}
+
+	
+	void	setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)	// stores the escape index of node nodeIndex in the active contiguous array
+	{
+		if (m_useQuantization)
+		{
+			m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;	// stored negated; b3QuantizedBvhNode::getEscapeIndex() negates it back
+		} 
+		else
+		{
+			m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
+		}
+
+	}
+
+	void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax) 
+	{
+		if (m_useQuantization)
+		{
+			unsigned short int quantizedAabbMin[3];
+			unsigned short int quantizedAabbMax[3];
+			quantize(quantizedAabbMin,newAabbMin,0);
+			quantize(quantizedAabbMax,newAabbMax,1);
+			for (int i=0;i<3;i++)	// per component: lower the stored min / raise the stored max only when the new box extends past it
+			{
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
+
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
+
+			}
+		} else
+		{
+			//non-quantized: delegate to b3Vector3::setMin / setMax (presumably component-wise min/max - confirm in b3Vector3)
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);		
+		}
+	}
+
+	void	swapLeafNodes(int firstIndex,int secondIndex);
+
+	void	assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
+
+protected:
+
+	
+
+	void	buildTree	(int startIndex,int endIndex);
+
+	int	calcSplittingAxis(int startIndex,int endIndex);
+
+	int	sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
+	
+	void	walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+
+	void	walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+
+	///tree traversal designed for small-memory processors like PS3 SPU
+	void	walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
+	void	walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
+	void	walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const;
+	
+
+
+
+	void	updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
+
+public:
+	
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	b3QuantizedBvh();
+
+	virtual ~b3QuantizedBvh();
+
+	
+	///***************************************** expert/internal use only *************************
+	void	setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));
+	QuantizedNodeArray&	getLeafNodeArray() {			return	m_quantizedLeafNodes;	}
+	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
+	void	buildInternal();
+	///***************************************** expert/internal use only *************************
+
+	void	reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+	void	reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
+	void	reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+
+		B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const
+	{
+		// Maps a point inside [m_bvhAabbMin, m_bvhAabbMax] (asserted below) to three unsigned shorts: (point - m_bvhAabbMin) * m_bvhQuantization, rounded down for a min corner and up for a max corner.
+		b3Assert(m_useQuantization);
+
+		b3Assert(point.getX() <= m_bvhAabbMax.getX());
+		b3Assert(point.getY() <= m_bvhAabbMax.getY());
+		b3Assert(point.getZ() <= m_bvhAabbMax.getZ());
+
+		b3Assert(point.getX() >= m_bvhAabbMin.getX());
+		b3Assert(point.getY() >= m_bvhAabbMin.getY());
+		b3Assert(point.getZ() >= m_bvhAabbMin.getZ());
+
+		b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
+		///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
+		///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
+		///@todo: double-check this
+		if (isMax)
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1));
+		} else
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
+		}
+
+		// NOTE(review): in the debug block below the printf arguments are passed as (diff, newPoint, point) while the format labels the last two "old", "new" - the labels look swapped.
+#ifdef DEBUG_CHECK_DEQUANTIZATION
+		b3Vector3 newPoint = unQuantize(out);
+		if (isMax)
+		{
+			if (newPoint.getX() < point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
+			}
+			if (newPoint.getY() < point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
+			}
+			if (newPoint.getZ() < point.getZ())
+			{
+
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
+			}
+		} else
+		{
+			if (newPoint.getX() > point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
+			}
+			if (newPoint.getY() > point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
+			}
+			if (newPoint.getZ() > point.getZ())
+			{
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
+			}
+		}
+#endif //DEBUG_CHECK_DEQUANTIZATION
+
+	}
+
+
+	B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const
+	{
+		///Same as quantize(), but point2 is first clamped into [m_bvhAabbMin, m_bvhAabbMax].
+		b3Assert(m_useQuantization);
+
+		b3Vector3 insideBvh = point2;
+		///apply the lower bound first ...
+		insideBvh.setMax(m_bvhAabbMin);
+		///... then the upper bound
+		insideBvh.setMin(m_bvhAabbMax);
+		quantize(out,insideBvh,isMax);
+	}
+	
+	B3_FORCE_INLINE b3Vector3	unQuantize(const unsigned short* vecIn) const
+	{
+		const b3Scalar scaledX = b3Scalar(vecIn[0]) / m_bvhQuantization.getX();
+		const b3Scalar scaledY = b3Scalar(vecIn[1]) / m_bvhQuantization.getY();
+		const b3Scalar scaledZ = b3Scalar(vecIn[2]) / m_bvhQuantization.getZ();
+		b3Vector3 dequantized;
+		dequantized.setValue(scaledX,scaledY,scaledZ);	///undo the scale applied by quantize() ...
+		dequantized += m_bvhAabbMin;	///... then undo the offset
+		return dequantized;
+	}
+
+	///setTraversalMode lets you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
+	void	setTraversalMode(b3TraversalMode	traversalMode)
+	{
+		m_traversalMode = traversalMode;
+	}
+
+
+	B3_FORCE_INLINE QuantizedNodeArray&	getQuantizedNodeArray()
+	{	
+		return	m_quantizedContiguousNodes;	///mutable reference to the member array, not a copy
+	}
+
+
+	B3_FORCE_INLINE BvhSubtreeInfoArray&	getSubtreeInfoArray()
+	{
+		return m_SubtreeHeaders;	///mutable reference to the member array, not a copy
+	}
+
+////////////////////////////////////////////////////////////////////
+
+	/////Calculate space needed to store BVH for serialization
+	unsigned calculateSerializeBufferSize() const;
+
+	/// Data buffer MUST be 16 byte aligned
+	virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
+
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+
+	static unsigned int getAlignmentSerializationPadding();
+//////////////////////////////////////////////////////////////////////
+
+	
+	virtual	int	calculateSerializeBufferSizeNew() const;
+
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+
+	virtual	void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData);
+
+	virtual	void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData);
+
+
+////////////////////////////////////////////////////////////////////
+
+	B3_FORCE_INLINE bool isQuantized() const	///const: only reads m_useQuantization, so it can be called on a const BVH
+	{
+		return m_useQuantization;
+	}
+
+private:
+	// Special "copy" constructor that allows for in-place deserialization
+	// Prevents b3Vector3's default constructor from being called, but doesn't initialize much else
+	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
+	b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory);
+
+}
+;
+
+
+struct b3OptimizedBvhNodeFloatData	//plain-data node record; b3QuantizedBvhFloatData::m_contiguousNodesPtr points to these
+{
+	b3Vector3FloatData	m_aabbMinOrg;
+	b3Vector3FloatData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char m_pad[4];	//four explicit padding bytes; keep field order and sizes unchanged
+};
+
+struct b3OptimizedBvhNodeDoubleData	//plain-data node record; b3QuantizedBvhDoubleData::m_contiguousNodesPtr points to these
+{
+	b3Vector3DoubleData	m_aabbMinOrg;
+	b3Vector3DoubleData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char	m_pad[4];	//four explicit padding bytes; keep field order and sizes unchanged
+};
+
+
+
+struct	b3QuantizedBvhFloatData	//single-precision plain-data image of a b3QuantizedBvh; filled in by deSerializeFloat's caller
+{
+	b3Vector3FloatData			m_bvhAabbMin;
+	b3Vector3FloatData			m_bvhAabbMax;
+	b3Vector3FloatData			m_bvhQuantization;
+	int					m_curNodeIndex;
+	int					m_useQuantization;
+	int					m_numContiguousLeafNodes;
+	int					m_numQuantizedContiguousNodes;
+	b3OptimizedBvhNodeFloatData	*m_contiguousNodesPtr;
+	b3QuantizedBvhNodeData		*m_quantizedContiguousNodesPtr;
+	b3BvhSubtreeInfoData	*m_subTreeInfoPtr;
+	int					m_traversalMode;
+	int					m_numSubtreeHeaders;
+	//NOTE: m_subTreeInfoPtr precedes m_traversalMode here, whereas b3QuantizedBvhDoubleData places it last
+};
+
+struct	b3QuantizedBvhDoubleData	//double-precision plain-data image of a b3QuantizedBvh; consumed by deSerializeDouble
+{
+	b3Vector3DoubleData			m_bvhAabbMin;
+	b3Vector3DoubleData			m_bvhAabbMax;
+	b3Vector3DoubleData			m_bvhQuantization;
+	int							m_curNodeIndex;
+	int							m_useQuantization;
+	int							m_numContiguousLeafNodes;
+	int							m_numQuantizedContiguousNodes;
+	b3OptimizedBvhNodeDoubleData	*m_contiguousNodesPtr;
+	b3QuantizedBvhNodeData			*m_quantizedContiguousNodesPtr;
+	//NOTE: m_subTreeInfoPtr comes last here, whereas b3QuantizedBvhFloatData places it before m_traversalMode
+	int							m_traversalMode;
+	int							m_numSubtreeHeaders;
+	b3BvhSubtreeInfoData		*m_subTreeInfoPtr;
+};
+
+
+B3_FORCE_INLINE	int	b3QuantizedBvh::calculateSerializeBufferSizeNew() const
+{
+	return static_cast<int>(sizeof(b3QuantizedBvhData));	//size in bytes of b3QuantizedBvhData, narrowed to int
+}
+
+
+
+#endif //B3_QUANTIZED_BVH_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
new file mode 100644
index 0000000000..4d97f7f62b
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
@@ -0,0 +1,214 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3StridingMeshInterface.h"
+
+
+b3StridingMeshInterface::~b3StridingMeshInterface()
+{
+	//intentionally empty: out-of-line definition of the virtual destructor declared in the header
+}
+
+
+void	b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
+{
+	(void)aabbMin;
+	(void)aabbMax;
+	///Visits every triangle of every sub-part and hands its three scaled vertices to 'callback'. aabbMin/aabbMax are ignored (no culling here).
+	int part,graphicssubparts = getNumSubParts();
+	const unsigned char * vertexbase;
+	const unsigned char * indexbase;
+	int indexstride;
+	PHY_ScalarType type;
+	PHY_ScalarType gfxindextype;
+	int stride,numverts,numtriangles;
+	int gfxindex;
+	b3Vector3 triangle[3];
+
+	b3Vector3 meshScaling = getScaling();
+
+	///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
+	for (part=0;part<graphicssubparts ;part++)
+	{
+		getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);
+		///'stride' and 'indexstride' are used as byte offsets below (both base pointers are unsigned char*)
+
+		///it is unlikely that developers want to pass in double-precision meshes in single-precision Bullet build
+		///so disable this feature by default
+		///see patch http://code.google.com/p/bullet/issues/detail?id=213
+
+		switch (type)
+		{
+		case PHY_FLOAT:
+		 {
+			 ///vertex components are read as float; the inner switch selects the index width
+			 float* graphicsbase;
+
+			 switch (gfxindextype)
+			 {
+			 case PHY_INTEGER:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 case PHY_SHORT:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			case PHY_UCHAR:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 default:
+				 b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT) || (gfxindextype == PHY_UCHAR));
+			 }
+			 break;
+		 }
+
+		case PHY_DOUBLE:
+			{
+				double* graphicsbase;
+				///vertex components are read as double and converted to b3Scalar before scaling
+				switch (gfxindextype)
+				{
+				case PHY_INTEGER:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_SHORT:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_UCHAR:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),  (b3Scalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				default:
+					b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT) || (gfxindextype == PHY_UCHAR));
+				}
+				break;
+			}
+		default:
+			b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
+		}
+
+		unLockReadOnlyVertexBase(part);
+	}
+}
+
+void	b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax)
+{
+	///Callback that grows a bounding box around every triangle vertex it is handed.
+	struct	BoundsAccumulator : public b3InternalTriangleIndexCallback
+	{
+		b3Vector3	m_lower;
+		b3Vector3	m_upper;
+
+		BoundsAccumulator()
+		{
+			///start from an inverted box (lower = +large, upper = -large) so the first vertex sets both corners
+			const b3Scalar big = b3Scalar(B3_LARGE_FLOAT);
+			m_lower.setValue(big,big,big);
+			m_upper.setValue(-big,-big,-big);
+		}
+
+		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex)
+		{
+			(void)partId;
+			(void)triangleIndex;
+			for (int corner=0;corner<3;corner++)
+			{
+				m_lower.setMin(triangle[corner]);
+				m_upper.setMax(triangle[corner]);
+			}
+		}
+	};
+
+	///the query box passed along spans [-large, +large] on every axis
+	const b3Scalar huge = b3Scalar(B3_LARGE_FLOAT);
+	aabbMin.setValue(-huge,-huge,-huge);
+	aabbMax.setValue(huge,huge,huge);
+	BoundsAccumulator	bounds;
+	InternalProcessAllTriangles(&bounds,aabbMin,aabbMax);
+	aabbMin = bounds.m_lower;
+	aabbMax = bounds.m_upper;
+}
+
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
new file mode 100644
index 0000000000..9513f68f77
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
@@ -0,0 +1,167 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_STRIDING_MESHINTERFACE_H
+#define B3_STRIDING_MESHINTERFACE_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "b3TriangleCallback.h"
+//#include "b3ConcaveShape.h"
+
+
+enum  	PHY_ScalarType { //tags the element type of a vertex or index buffer
+  PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT, //InternalProcessAllTriangles accepts FLOAT/DOUBLE vertices and INTEGER/SHORT/UCHAR indices
+  PHY_FIXEDPOINT88, PHY_UCHAR 
+};
+
+
+///	The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
+/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
+/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
+B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface
+{
+	protected:
+		///per-axis scale factors; InternalProcessAllTriangles multiplies every vertex by them (initialised to 1,1,1)
+		b3Vector3 m_scaling;
+
+	public:
+		B3_DECLARE_ALIGNED_ALLOCATOR();
+		
+		b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.)))
+		{
+
+		}
+
+		virtual ~b3StridingMeshInterface();
+
+		///calls callback->internalProcessTriangleIndex once per triangle of every sub-part, with the vertices scaled by m_scaling
+		///the base implementation does not use aabbMin/aabbMax
+		virtual void	InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
+
+		///brute force method to calculate aabb
+		void	calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax);
+
+		/// get read and write access to a subpart of a triangle mesh
+		/// this subpart has a continuous array of vertices and indices
+		/// in this way the mesh can be handled as chunks of memory with striding
+		/// very similar to OpenGL vertexarray support
+		/// make a call to unLockVertexBase when the read and write access is finished	
+		virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
+		/// const variant of getLockedVertexIndexBase; InternalProcessAllTriangles pairs each call with unLockReadOnlyVertexBase
+		virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;
+	
+		/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+		/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+		virtual void	unLockVertexBase(int subpart)=0;
+
+		virtual void	unLockReadOnlyVertexBase(int subpart) const=0;
+
+
+		/// getNumSubParts returns the number of separate subparts
+		/// each subpart has a continuous array of vertices and indices
+		virtual int		getNumSubParts() const=0;
+
+		virtual void	preallocateVertices(int numverts)=0;
+		virtual void	preallocateIndices(int numindices)=0;
+		/// premade-AABB hooks: the base class reports none and its set/get bodies do nothing
+		virtual bool	hasPremadeAabb() const { return false; }
+		virtual void	setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
+                {
+                        (void) aabbMin;
+                        (void) aabbMax;
+                }
+		virtual void	getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
+        {
+            (void) aabbMin;
+            (void) aabbMax;
+        }
+
+		const b3Vector3&	getScaling() const {
+			return m_scaling;
+		}
+		void	setScaling(const b3Vector3& scaling)
+		{
+			m_scaling = scaling;
+		}
+		///the base implementation returns sizeof(b3StridingMeshInterfaceData)
+		virtual	int	calculateSerializeBufferSize() const;
+
+		///fills the dataBuffer and returns the struct name (and 0 on failure)
+		//virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+
+
+};
+
+struct	b3IntIndexData	//one int index; b3MeshPartData::m_indices32 points to these
+{
+	int	m_value;
+};
+
+struct	b3ShortIntIndexData	//one short index; b3MeshPartData::m_indices16 points to these
+{
+	short m_value;
+	char m_pad[2];	//two explicit padding bytes after the index
+};
+
+struct	b3ShortIntIndexTripletData	//three short indices; b3MeshPartData::m_3indices16 points to these
+{
+	short	m_values[3];
+	char	m_pad[2];	//two explicit padding bytes after the three indices
+};
+
+struct	b3CharIndexTripletData	//three unsigned char indices; b3MeshPartData::m_3indices8 points to these
+{
+	unsigned char m_values[3];
+	char	m_pad;	//one explicit padding byte after the three indices
+};
+
+
+///do not change these serialization structures: doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3MeshPartData
+{
+	b3Vector3FloatData			*m_vertices3f;
+	b3Vector3DoubleData			*m_vertices3d;
+	//index arrays in int, short-triplet and unsigned-char-triplet form (NOTE(review): presumably only one is set per part; confirm against the serializer)
+	b3IntIndexData				*m_indices32;
+	b3ShortIntIndexTripletData	*m_3indices16;
+	b3CharIndexTripletData		*m_3indices8;
+
+	b3ShortIntIndexData			*m_indices16;//backwards compatibility
+
+	int                     m_numTriangles;//length of m_indices = m_numTriangles
+	int                     m_numVertices;
+};
+
+
+///do not change these serialization structures: doing so requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3StridingMeshInterfaceData	//b3StridingMeshInterface::calculateSerializeBufferSize returns sizeof this struct
+{
+	b3MeshPartData	*m_meshPartsPtr;
+	b3Vector3FloatData	m_scaling;
+	int	m_numMeshParts;
+	char m_padding[4];	//four explicit padding bytes
+};
+
+
+
+
+B3_FORCE_INLINE	int	b3StridingMeshInterface::calculateSerializeBufferSize() const
+{
+	return static_cast<int>(sizeof(b3StridingMeshInterfaceData));	//size in bytes of the serialization struct, narrowed to int
+}
+
+
+
+#endif //B3_STRIDING_MESHINTERFACE_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h
new file mode 100644
index 0000000000..d073ee57c3
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h
@@ -0,0 +1,38 @@
+
+#ifndef B3_SUPPORT_MAPPINGS_H
+#define B3_SUPPORT_MAPPINGS_H
+
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "b3VectorFloat4.h"
+
+
+struct b3GjkPairDetector;
+
+
+
+inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,
+	const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
+{
+	// Note: 'margin' is not read anywhere in this body.
+	if (hull->m_numVertices <= 0)
+	{
+		// Nothing to search: return the zero vector.
+		return b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+	}
+
+	// maxDot() yields an index into the hull's slice of verticesA, which starts at m_vertexOffset;
+	// bestDot is passed by reference to maxDot().
+	const b3Vector3 direction = supportVec;
+	b3Scalar bestDot = b3Scalar(-B3_LARGE_FLOAT);
+	const int best = (int) direction.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, bestDot);
+	return verticesA[hull->m_vertexOffset+best];
+}
+
+inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,const b3AlignedObjectArray<b3Vector3>& verticesA)
+{
+	const b3Scalar noMargin = 0.f;	// forwards to the margin variant with a zero margin
+	return localGetSupportVertexWithMargin(supportVec,hull,verticesA,noMargin);
+}
+
+#endif //B3_SUPPORT_MAPPINGS_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
new file mode 100644
index 0000000000..9066451884
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
@@ -0,0 +1,28 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3TriangleCallback.h"
+
+b3TriangleCallback::~b3TriangleCallback()
+{
+	//intentionally empty: out-of-line definition of the virtual destructor declared in the header
+}
+
+
+b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
+{
+	//intentionally empty: out-of-line definition of the virtual destructor declared in the header
+}
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
new file mode 100644
index 0000000000..3059fa4f21
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
@@ -0,0 +1,42 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_TRIANGLE_CALLBACK_H
+#define B3_TRIANGLE_CALLBACK_H
+
+#include "Bullet3Common/b3Vector3.h"
+
+
+///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
+///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as  b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
+class b3TriangleCallback
+{
+public:
+	///implementers override processTriangle; 'triangle' points to the vertices of one triangle
+	virtual ~b3TriangleCallback();
+	virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
+};
+
+class b3InternalTriangleIndexCallback
+{
+public:
+	///b3StridingMeshInterface::InternalProcessAllTriangles calls internalProcessTriangleIndex once per triangle with three scaled vertices, the sub-part index and the triangle index within that sub-part
+	virtual ~b3InternalTriangleIndexCallback();
+	virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int  triangleIndex) = 0;
+};
+
+
+
+#endif //B3_TRIANGLE_CALLBACK_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
new file mode 100644
index 0000000000..a0f59babbe
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
@@ -0,0 +1,95 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3TriangleIndexVertexArray.h"
+
+b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride)
+: m_hasAabb(0)
+{
+	///Convenience constructor: describes the caller's buffers in one b3IndexedMesh and passes it to addIndexedMesh.
+	b3IndexedMesh onlyPart;
+	///vertex buffer description (pointer stored as raw bytes)
+	onlyPart.m_numVertices = numVertices;
+	onlyPart.m_vertexBase = reinterpret_cast<const unsigned char *>(vertexBase);
+	onlyPart.m_vertexStride = vertexStride;
+	///index buffer description (pointer stored as raw bytes)
+	onlyPart.m_numTriangles = numTriangles;
+	onlyPart.m_triangleIndexBase = reinterpret_cast<const unsigned char *>(triangleIndexBase);
+	onlyPart.m_triangleIndexStride = triangleIndexStride;
+	addIndexedMesh(onlyPart);
+}
+
+b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
+{
+	//intentionally empty: no cleanup is performed in this body
+}
+
+void	b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
+{
+	///Fills the out-parameters with the buffer description stored for sub-part 'subpart'.
+
+	b3Assert(subpart< getNumSubParts() );
+	b3IndexedMesh& part = m_indexedMeshes[subpart];
+
+	///vertex buffer: the stored pointer is const, so constness is cast away for this writable variant
+	numverts = part.m_numVertices;
+	*vertexbase = const_cast<unsigned char *>(part.m_vertexBase);
+	type = part.m_vertexType;
+	vertexStride = part.m_vertexStride;
+
+	///index buffer
+	numfaces = part.m_numTriangles;
+	*indexbase = const_cast<unsigned char *>(part.m_triangleIndexBase);
+	indexstride = part.m_triangleIndexStride;
+	indicestype = part.m_indexType;
+}
+
+void	b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
+{
+	///Read-only counterpart of getLockedVertexIndexBase(): same outputs, but the buffer pointers stay const.
+	const b3IndexedMesh& part = m_indexedMeshes[subpart];
+
+	numverts = part.m_numVertices;
+	*vertexbase = part.m_vertexBase;
+	type = part.m_vertexType;
+	vertexStride = part.m_vertexStride;
+
+	numfaces = part.m_numTriangles;
+	*indexbase = part.m_triangleIndexBase;
+	indexstride = part.m_triangleIndexStride;
+	indicestype = part.m_indexType;
+	///note: unlike the writable variant, there is no b3Assert on 'subpart' in this function
+}
+
+bool	b3TriangleIndexVertexArray::hasPremadeAabb() const
+{
+	return (m_hasAabb == 1);	//m_hasAabb is an int flag: 0 from the constructor, 1 once setPremadeAabb has run
+}
+
+
+void	b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
+{
+	m_aabbMin = aabbMin;	//written from a const method (NOTE(review): presumably these members are declared mutable in the header; confirm)
+	m_aabbMax = aabbMax;
+	m_hasAabb = 1; // this is intentionally an int see notes in header
+}
+
+void	b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
+{
+	*aabbMin = m_aabbMin;	//both pointers are dereferenced unconditionally, so they must be non-null
+	*aabbMax = m_aabbMax;	//hasPremadeAabb() is not checked here
+}
+
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
new file mode 100644
index 0000000000..d26b2893bc
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
@@ -0,0 +1,133 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
+#define B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
+
+#include "b3StridingMeshInterface.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Scalar.h"
+
+
+///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
+///Instead of the number of indices, we pass the number of triangles.
+B3_ATTRIBUTE_ALIGNED16( struct)	b3IndexedMesh
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+   int                     m_numTriangles;
+   const unsigned char *   m_triangleIndexBase;
+   // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
+   int                     m_triangleIndexStride;
+   int                     m_numVertices;
+   const unsigned char *   m_vertexBase;
+   // Size of a vertex, in bytes
+   int                     m_vertexStride;
+
+   // The index type is set when adding an indexed mesh to the
+   // b3TriangleIndexVertexArray, do not set it manually
+   PHY_ScalarType m_indexType;
+
+   // The vertex type has a default type similar to Bullet's precision mode (float or double)
+   // but can be set manually if you for example run Bullet with double precision but have
+   // mesh data in single precision..
+   PHY_ScalarType m_vertexType;
+
+   // Starts out as an empty mesh (zero counts and strides, null buffers) so that no member is left indeterminate.
+   b3IndexedMesh()
+	   :m_numTriangles(0), m_triangleIndexBase(0), m_triangleIndexStride(0), m_numVertices(0), m_vertexBase(0), m_vertexStride(0),
+	   m_indexType(PHY_INTEGER),
+#ifdef B3_USE_DOUBLE_PRECISION
+      m_vertexType(PHY_DOUBLE)
+#else // B3_USE_DOUBLE_PRECISION
+      m_vertexType(PHY_FLOAT)
+#endif // B3_USE_DOUBLE_PRECISION
+      {
+      }
+}
+;
+
+// Array type used by b3TriangleIndexVertexArray to hold its b3IndexedMesh entries (one per sub-part).
+typedef b3AlignedObjectArray<b3IndexedMesh>	IndexedMeshArray;
+
+///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
+///Additional meshes can be added using addIndexedMesh
+///No duplicate is made of the vertex/index data, it only indexes into external vertex/index arrays.
+///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
+B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface
+{
+protected:
+	IndexedMeshArray	m_indexedMeshes; // one entry per sub-part, see getNumSubParts()
+	int m_pad[2]; // NOTE(review): presumably layout padding only - confirm
+	mutable int m_hasAabb; // using int instead of bool to maintain alignment
+	mutable b3Vector3 m_aabbMin; // mutable: written by the const method setPremadeAabb()
+	mutable b3Vector3 m_aabbMax;
+
+public:
+
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+	/// Default constructor: only clears the premade-AABB flag.
+	b3TriangleIndexVertexArray() : m_hasAabb(0)
+	{
+	}
+
+	virtual ~b3TriangleIndexVertexArray();
+
+	//just to be backwards compatible
+	b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride);
+	/// Appends a copy of mesh as a new sub-part; the copy's m_indexType is overwritten with indexType.
+	void	addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
+	{
+		m_indexedMeshes.push_back(mesh);
+		m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType;
+	}
+	/// The two getLocked* methods report the vertex and index buffers of one sub-part through
+	/// their output arguments; the read-only variant keeps the buffer pointers const.
+	virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);
+
+	virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;
+
+	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+	virtual void	unLockVertexBase(int subpart) {(void)subpart;}
+	/// read-only counterpart; like unLockVertexBase it performs no work in this class
+	virtual void	unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}
+
+	/// getNumSubParts returns the number of separate subparts
+	/// each subpart has a continuous array of vertices and indices
+	virtual int		getNumSubParts() const { 
+		return (int)m_indexedMeshes.size();
+	}
+
+	IndexedMeshArray&	getIndexedMeshArray()
+	{
+		return m_indexedMeshes;
+	}
+
+	const IndexedMeshArray&	getIndexedMeshArray() const
+	{
+		return m_indexedMeshes;
+	}
+	/// the preallocate calls are accepted but ignored by this class
+	virtual void	preallocateVertices(int numverts){(void) numverts;}
+	virtual void	preallocateIndices(int numindices){(void) numindices;}
+	/// premade AABB cache: set stores a box and raises the flag, has queries the flag, get copies the box out
+	virtual bool	hasPremadeAabb() const;
+	virtual void	setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const;
+	virtual void	getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const;
+
+}
+;
+
+#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h
new file mode 100644
index 0000000000..f6f65f7719
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h
@@ -0,0 +1,11 @@
+#ifndef B3_VECTOR_FLOAT4_H
+#define B3_VECTOR_FLOAT4_H
+
+#include "Bullet3Common/b3Transform.h"
+
+//#define cross3(a,b) (a.cross(b))
+#define float4 b3Vector3
+//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
+
+
+#endif //B3_VECTOR_FLOAT4_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
new file mode 100644
index 0000000000..cf3d5ef49d
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
@@ -0,0 +1,609 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+	
+	Elsevier CDROM license agreements grants nonexclusive license to use the software
+	for any purpose, commercial or non-commercial as long as the following credit is included
+	identifying the original source of the software:
+
+	Parts of the source are "from the book Real-Time Collision Detection by
+	Christer Ericson, published by Morgan Kaufmann Publishers,
+	(c) 2005 Elsevier Inc."
+		
+*/
+
+
+#include "b3VoronoiSimplexSolver.h"
+
+#define VERTA  0
+#define VERTB  1
+#define VERTC  2
+#define VERTD  3
+
+#define B3_CATCH_DEGENERATE_TETRAHEDRON 1
+void	b3VoronoiSimplexSolver::removeVertex(int index)
+{
+	// Shrinks the simplex by one: slot 'index' is overwritten with the last stored vertex.
+	b3Assert(m_numVertices>0);
+	const int tail = --m_numVertices;
+	m_simplexVectorW[index] = m_simplexVectorW[tail];
+	m_simplexPointsP[index] = m_simplexPointsP[tail];
+	m_simplexPointsQ[index] = m_simplexPointsQ[tail];
+}
+
+void	b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
+{
+	// Removes every vertex whose usage bit is clear. Slots are visited from the
+	// highest index down: removeVertex() refills a freed slot with the current
+	// last vertex, so a lower slot must not be touched before the higher ones.
+	if (!usedVerts.usedVertexD && (numVertices() >= 4))
+		removeVertex(VERTD);
+	if (!usedVerts.usedVertexC && (numVertices() >= 3))
+		removeVertex(VERTC);
+	if (!usedVerts.usedVertexB && (numVertices() >= 2))
+		removeVertex(VERTB);
+	if (!usedVerts.usedVertexA && (numVertices() >= 1))
+		removeVertex(VERTA);
+
+}
+
+
+
+
+
+//clear the simplex, remove all the vertices
+void b3VoronoiSimplexSolver::reset()
+{
+	m_numVertices = 0;
+	m_needsUpdate = true; // the next updateClosestVectorAndPoints() recomputes the cache
+	m_cachedValidClosest = false;
+	m_cachedBC.reset();
+	m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); // presumably a value no real vertex matches
+}
+
+
+
+	//add a vertex
+void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
+{
+	// Appends one vertex (w, p, q) and marks the cache stale; the slot must lie inside the fixed-size arrays.
+	b3Assert(m_numVertices < VORONOI_SIMPLEX_MAX_VERTS);
+	m_lastW = w;
+	m_needsUpdate = true;
+	m_simplexVectorW[m_numVertices] = w;
+	m_simplexPointsP[m_numVertices] = p;
+	m_simplexPointsQ[m_numVertices] = q;
+	m_numVertices++;
+}
+
+bool	b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
+{
+	// Recomputes m_cachedP1/m_cachedP2/m_cachedV and m_cachedBC only if the simplex changed since the last update.
+	if (m_needsUpdate)
+	{
+		m_cachedBC.reset();
+
+		m_needsUpdate = false;
+
+		switch (numVertices())
+		{
+		case 0:
+				m_cachedValidClosest = false;
+				break;
+		case 1:
+			{
+				m_cachedP1 = m_simplexPointsP[0];
+				m_cachedP2 = m_simplexPointsQ[0];
+				m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0]
+				//m_cachedBC was reset above, only the coordinates have to be set
+				m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+				m_cachedValidClosest = m_cachedBC.isValid();
+				break;
+			};
+		case 2:
+			{
+			//closest point origin from line segment
+					const b3Vector3& from = m_simplexVectorW[0];
+					const b3Vector3& to = m_simplexVectorW[1];
+					//t becomes the parameter of the closest point along the segment, clamped to [0,1]
+
+					b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+					b3Vector3 diff = p - from;
+					b3Vector3 v = to - from;
+					b3Scalar t = v.dot(diff);
+					
+					if (t > 0) {
+						b3Scalar dotVV = v.dot(v);
+						if (t < dotVV) {
+							t /= dotVV;
+							//closest point lies inside the segment, both vertices stay
+							m_cachedBC.m_usedVertices.usedVertexA = true;
+							m_cachedBC.m_usedVertices.usedVertexB = true;
+						} else {
+							t = 1;
+							//closest point is the end point 'to'
+							//reduce to 1 point
+							m_cachedBC.m_usedVertices.usedVertexB = true;
+						}
+					} else
+					{
+						t = 0;
+						//reduce to 1 point
+						m_cachedBC.m_usedVertices.usedVertexA = true;
+					}
+					m_cachedBC.setBarycentricCoordinates(1-t,t);
+					//the same parameter interpolates the stored P and Q points of both vertices
+
+					m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
+					m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
+					m_cachedV = m_cachedP1 - m_cachedP2;
+					
+					reduceVertices(m_cachedBC.m_usedVertices);
+
+					m_cachedValidClosest = m_cachedBC.isValid();
+					break;
+			}
+		case 3: 
+			{ 
+				//closest point origin from triangle 
+				b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); 
+
+				const b3Vector3& a = m_simplexVectorW[0]; 
+				const b3Vector3& b = m_simplexVectorW[1]; 
+				const b3Vector3& c = m_simplexVectorW[2]; 
+
+				closestPtPointTriangle(p,a,b,c,m_cachedBC); 
+				m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + 
+				m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + 
+				m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2]; 
+
+				m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + 
+				m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + 
+				m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2]; 
+
+				m_cachedV = m_cachedP1-m_cachedP2; 
+
+				reduceVertices (m_cachedBC.m_usedVertices); 
+				m_cachedValidClosest = m_cachedBC.isValid(); 
+
+				break; 
+			}
+		case 4:
+			{
+
+				
+				b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+				
+				const b3Vector3& a = m_simplexVectorW[0];
+				const b3Vector3& b = m_simplexVectorW[1];
+				const b3Vector3& c = m_simplexVectorW[2];
+				const b3Vector3& d = m_simplexVectorW[3];
+
+				bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC);
+
+				if (hasSeperation)
+				{
+
+					m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
+
+					m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
+
+					m_cachedV = m_cachedP1-m_cachedP2;
+					reduceVertices (m_cachedBC.m_usedVertices);
+				} else
+				{
+//					printf("sub distance got penetration\n");
+
+					if (m_cachedBC.m_degenerate)
+					{
+						m_cachedValidClosest = false;
+					} else
+					{
+						m_cachedValidClosest = true;
+						//degenerate case == false, penetration = true + zero
+						m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
+					}
+					break;
+				}
+
+				m_cachedValidClosest = m_cachedBC.isValid();
+
+				//closest point origin from tetrahedron
+				break;
+			}
+		default:
+			{
+				m_cachedValidClosest = false;
+			}
+		};
+	}
+	//false for an empty simplex, a degenerate tetrahedron or a negative barycentric coordinate
+	return m_cachedValidClosest;
+
+}
+
+//return/calculate the closest vertex
+bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
+{
+	const bool valid = updateClosestVectorAndPoints(); // brings m_cachedV up to date if needed
+	v = m_cachedV;
+	return valid;
+}
+
+
+
+b3Scalar b3VoronoiSimplexSolver::maxVertex()
+{
+	// Largest squared length (length2) among the stored w vectors; 0 for an empty simplex.
+	b3Scalar largest = b3Scalar(0.);
+	const int count = numVertices();
+	for (int k = 0; k < count; ++k)
+	{
+		const b3Scalar len2 = m_simplexVectorW[k].length2();
+		largest = (largest < len2) ? len2 : largest;
+	}
+	return largest;
+}
+
+
+
+	//return the current simplex
+int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const
+{
+	const int count = numVertices(); // each buffer must have room for this many entries
+	for (int k = 0; k < count; ++k)
+	{
+		yBuf[k] = m_simplexVectorW[k];
+		pBuf[k] = m_simplexPointsP[k];
+		qBuf[k] = m_simplexPointsQ[k];
+	}
+	return count;
+}
+
+
+
+
+bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
+{
+	// Reports whether w matches a stored vertex or the last added one. Without
+	// BT_USE_EQUAL_VERTEX_THRESHOLD the match is an exact comparison; with it, a
+	// squared-distance test against m_equalVertexThreshold is used for stored vertices.
+	const int count = numVertices();
+	bool matched = false;
+
+	//w is in the current (reduced) simplex
+	for (int k = 0; k < count && !matched; ++k)
+	{
+#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD
+		matched = (m_simplexVectorW[k].distance2(w) <= m_equalVertexThreshold);
+#else
+		matched = (m_simplexVectorW[k] == w);
+#endif
+	}
+
+	//check in case lastW is already removed
+	if (w == m_lastW)
+		return true;
+	return matched;
+}
+
+void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v) 
+{
+	v = this->m_cachedV; // hands out the cached vector as it is, without recomputing it
+}
+
+
+bool b3VoronoiSimplexSolver::emptySimplex() const 
+{
+	// true while no vertex is stored
+	return m_numVertices == 0;
+}
+
+void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2) 
+{
+	// Bring the cache up to date, then copy out m_cachedP1/m_cachedP2; the validity flag is not reported.
+	(void) updateClosestVectorAndPoints();
+	p1 = m_cachedP1;
+	p2 = m_cachedP2;
+}
+
+
+
+
+bool	b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result)
+{
+	result.m_usedVertices.reset();
+	// Writes the point of triangle abc closest to p, the vertices it depends on and its barycentric coordinates into 'result'; every path returns true.
+    // Check if P in vertex region outside A
+    b3Vector3 ab = b - a;
+    b3Vector3 ac = c - a;
+    b3Vector3 ap = p - a;
+    b3Scalar d1 = ab.dot(ap);
+    b3Scalar d2 = ac.dot(ap);
+    if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0)) 
+	{
+		result.m_closestPointOnSimplex = a;
+		result.m_usedVertices.usedVertexA = true;
+		result.setBarycentricCoordinates(1,0,0);
+		return true;// a; // barycentric coordinates (1,0,0)
+	}
+
+    // Check if P in vertex region outside B
+    b3Vector3 bp = p - b;
+    b3Scalar d3 = ab.dot(bp);
+    b3Scalar d4 = ac.dot(bp);
+    if (d3 >= b3Scalar(0.0) && d4 <= d3) 
+	{
+		result.m_closestPointOnSimplex = b;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(0,1,0);
+
+		return true; // b; // barycentric coordinates (0,1,0)
+	}
+    // Check if P in edge region of AB, if so return projection of P onto AB
+    b3Scalar vc = d1*d4 - d3*d2;
+    if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) {
+        b3Scalar v = d1 / (d1 - d3); // NOTE(review): d1 - d3 equals ab.dot(ab), so a == b divides by zero here
+		result.m_closestPointOnSimplex = a + v * ab;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(1-v,v,0);
+		return true;
+        //return a + v * ab; // barycentric coordinates (1-v,v,0)
+    }
+
+    // Check if P in vertex region outside C
+    b3Vector3 cp = p - c;
+    b3Scalar d5 = ab.dot(cp);
+    b3Scalar d6 = ac.dot(cp);
+    if (d6 >= b3Scalar(0.0) && d5 <= d6) 
+	{
+		result.m_closestPointOnSimplex = c;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,0,1);
+		return true;//c; // barycentric coordinates (0,0,1)
+	}
+
+    // Check if P in edge region of AC, if so return projection of P onto AC
+    b3Scalar vb = d5*d2 - d1*d6;
+    if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) {
+        b3Scalar w = d2 / (d2 - d6);
+		result.m_closestPointOnSimplex = a + w * ac;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(1-w,0,w);
+		return true;
+        //return a + w * ac; // barycentric coordinates (1-w,0,w)
+    }
+
+    // Check if P in edge region of BC, if so return projection of P onto BC
+    b3Scalar va = d3*d6 - d5*d4;
+    if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) {
+        b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
+		
+		result.m_closestPointOnSimplex = b + w * (c - b);
+		result.m_usedVertices.usedVertexB = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,1-w,w);
+		return true;		
+       // return b + w * (c - b); // barycentric coordinates (0,1-w,w)
+    }
+
+    // P inside face region. Compute Q through its barycentric coordinates (u,v,w)
+    b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
+    b3Scalar v = vb * denom;
+    b3Scalar w = vc * denom;
+    
+	result.m_closestPointOnSimplex = a + ab * v + ac * w;
+	result.m_usedVertices.usedVertexA = true;
+	result.m_usedVertices.usedVertexB = true;
+	result.m_usedVertices.usedVertexC = true;
+	result.setBarycentricCoordinates(1-v-w,v,w);
+	
+	return true;
+//	return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
+
+}
+
+
+
+
+
+/// Test if point p and d lie on opposite sides of plane through abc.
+/// Returns 1 if they do, 0 if they do not, and -1 if d is (nearly) coplanar with abc, i.e. the tetrahedron is degenerate.
+int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
+{
+	b3Vector3 normal = (b-a).cross(c-a);
+
+    b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
+    b3Scalar signd = (d - a).dot( normal); // [AD AB AC]
+
+#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON
+	// The tolerance depends on the scalar precision; B3_USE_DOUBLE_PRECISION is the macro that also selects PHY_DOUBLE in b3IndexedMesh.
+#ifdef B3_USE_DOUBLE_PRECISION
+	if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
+	{
+		return -1;
+	}
+#else
+	if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
+	{
+		return -1;
+	}
+#endif
+#endif
+	// Points on opposite sides if expression signs are opposite
+    return signp * signd < b3Scalar(0.);
+}
+
+
+bool	b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
+{
+	// Closest point of tetrahedron abcd to p. Returns false when the tetrahedron is degenerate
+	// (finalResult.m_degenerate is set) or when p lies on the inner side of all four face planes;
+	// in the latter case finalResult keeps p with all four vertices marked used. Otherwise every
+	// face that p lies outside of is tested with closestPtPointTriangle and the nearest hit is
+	// stored in finalResult, remapped from triangle order to tetrahedron vertex order.
+	b3SubSimplexClosestResult tempResult;
+
+    // Start out assuming point inside all halfspaces, so closest to itself
+	finalResult.m_closestPointOnSimplex = p;
+	finalResult.m_usedVertices.reset();
+    finalResult.m_usedVertices.usedVertexA = true;
+	finalResult.m_usedVertices.usedVertexB = true;
+	finalResult.m_usedVertices.usedVertexC = true;
+	finalResult.m_usedVertices.usedVertexD = true;
+
+    int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
+	int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
+  	int	pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
+	int	pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
+
+   if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
+   {
+	   finalResult.m_degenerate = true;
+	   return false;
+   }
+
+   if (!pointOutsideABC  && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
+	 {
+		 return false;
+	 }
+
+
+    b3Scalar bestSqDist = FLT_MAX;
+    // If point outside face abc then compute closest point on abc
+	if (pointOutsideABC) 
+	{
+        closestPtPointTriangle(p, a, b, c,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		
+        b3Scalar sqDist = (q - p).dot( q - p);
+        // Update best closest point if (squared) distance is less than current best
+        if (sqDist < bestSqDist) {
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			//convert result bitmask!
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC],
+					0
+			);
+
+		}
+    }
+  
+
+	// Repeat test for face acd
+	if (pointOutsideACD) 
+	{
+        closestPtPointTriangle(p, a, c, d,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					0,
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC]
+			);
+
+		}
+    }
+    // Repeat test for face adb
+
+	
+	if (pointOutsideADB)
+	{
+		closestPtPointTriangle(p, a, d, b,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;
+			
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					0,
+					tempResult.m_barycentricCoords[VERTB]
+			);
+
+		}
+    }
+    // Repeat test for face bdc
+    
+
+	if (pointOutsideBDC)
+	{
+        closestPtPointTriangle(p, b, d, c,tempResult);
+		b3Vector3 q = tempResult.m_closestPointOnSimplex;
+		//convert result bitmask!
+        b3Scalar sqDist = (q - p).dot( q - p);
+        if (sqDist < bestSqDist) 
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			finalResult.m_usedVertices.reset();
+			//
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+
+			finalResult.setBarycentricCoordinates(
+					0,
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					tempResult.m_barycentricCoords[VERTB]
+			);
+
+		}
+    }
+
+	// If no face test improved on bestSqDist, finalResult still carries its initial state
+	// (all four vertices marked used). That case is reported as success exactly like a
+	// regular hit, so the result is returned unconditionally.
+	// NOTE(review): m_degenerate is only ever set here, never cleared; updateClosestVectorAndPoints() resets the result beforehand.
+
+    return true;
+}
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h
new file mode 100644
index 0000000000..a6e27667d8
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h
@@ -0,0 +1,177 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef B3_VORONOI_SIMPLEX_SOLVER_H
+#define B3_VORONOI_SIMPLEX_SOLVER_H
+
+#include "Bullet3Common/b3Vector3.h"
+
+
+#define VORONOI_SIMPLEX_MAX_VERTS 5
+
+///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
+//#define BT_USE_EQUAL_VERTEX_THRESHOLD
+#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f
+
+
+struct b3UsageBitfield{
+	// One bit per simplex vertex A..D, followed by four bits that reset() does not touch.
+	unsigned short usedVertexA	: 1;
+	unsigned short usedVertexB	: 1;
+	unsigned short usedVertexC	: 1;
+	unsigned short usedVertexD	: 1;
+	unsigned short unused1		: 1;
+	unsigned short unused2		: 1;
+	unsigned short unused3		: 1;
+	unsigned short unused4		: 1;
+	b3UsageBitfield()
+	{
+		reset();
+	}
+	void reset() // clears the four vertex bits
+	{
+		usedVertexA = 0;
+		usedVertexB = 0;
+		usedVertexC = 0;
+		usedVertexD = 0;
+	}
+};
+
+
+struct	b3SubSimplexClosestResult
+{
+	b3Vector3	m_closestPointOnSimplex;
+	// One usage bit per simplex vertex: a set bit means the related vertex is used.
+	b3UsageBitfield	m_usedVertices;
+	// Barycentric weights, one per simplex vertex.
+	b3Scalar	m_barycentricCoords[4];
+	bool m_degenerate;
+
+	// Stores the four weights; omitted arguments default to zero.
+	void	setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.))
+	{
+		m_barycentricCoords[0] = a;
+		m_barycentricCoords[1] = b;
+		m_barycentricCoords[2] = c;
+		m_barycentricCoords[3] = d;
+	}
+	// Clears the usage bits, the weights and the degenerate flag; m_closestPointOnSimplex is left untouched.
+	void	reset()
+	{
+		m_usedVertices.reset();
+		setBarycentricCoordinates();
+		m_degenerate = false;
+	}
+	// True only if each of the four weights compares >= 0.
+	bool	isValid()
+	{
+		for (int k = 0; k < 4; ++k)
+		{
+			if (!(m_barycentricCoords[k] >= b3Scalar(0.)))
+				return false;
+		}
+		return true;
+	}
+};
+
+/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
+/// Can be used with GJK, as an alternative to Johnson distance algorithm.
+
+B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver 
+{
+public:
+
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	int	m_numVertices; // number of valid entries in the three arrays below
+
+	b3Vector3	m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
+	b3Vector3	m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
+	b3Vector3	m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
+
+
+	// cache written by updateClosestVectorAndPoints()
+	b3Vector3	m_cachedP1;
+	b3Vector3	m_cachedP2;
+	b3Vector3	m_cachedV;
+	b3Vector3	m_lastW; // last w passed to addVertex()
+	
+	b3Scalar	m_equalVertexThreshold;
+	bool		m_cachedValidClosest;
+
+
+	b3SubSimplexClosestResult m_cachedBC;
+
+	bool	m_needsUpdate; // set by addVertex()/reset(), cleared by updateClosestVectorAndPoints()
+	
+	void	removeVertex(int index);
+	void	reduceVertices (const b3UsageBitfield& usedVerts);
+	bool	updateClosestVectorAndPoints();
+
+	bool	closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
+	int		pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
+	bool	closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result);
+
+public:
+	/// Constructs an empty simplex; reset() puts the vertex count, flags, m_lastW and m_cachedBC into a defined state.
+	b3VoronoiSimplexSolver()
+		:  m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
+	{
+		reset();
+	}
+	 void reset();
+
+	 void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
+
+	 void	setEqualVertexThreshold(b3Scalar threshold)
+	 {
+		 m_equalVertexThreshold = threshold;
+	 }
+
+	 b3Scalar	getEqualVertexThreshold() const
+	 {
+		 return m_equalVertexThreshold;
+	 }
+
+	 bool closest(b3Vector3& v);
+
+	 b3Scalar maxVertex();
+
+	 bool fullSimplex() const
+	 {
+		 return (m_numVertices == 4);
+	 }
+
+	 int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const;
+
+	 bool inSimplex(const b3Vector3& w);
+	
+	 void backup_closest(b3Vector3& v) ;
+
+	 bool emptySimplex() const ;
+
+	 void compute_points(b3Vector3& p1, b3Vector3& p2) ;
+
+	 int numVertices() const 
+	 {
+		 return m_numVertices;
+	 }
+
+};
+
+#endif //B3_VORONOI_SIMPLEX_SOLVER_H
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
new file mode 100644
index 0000000000..faa413441c
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
@@ -0,0 +1,283 @@
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_CONCAVE_TRIMESH 5
+#define TRIANGLE_NUM_CONVEX_FACES 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+#define SHAPE_SPHERE 7
+//unsigned alias used for BodyData::m_collidableIdx
+typedef unsigned int u32;
+//bit count from which getTriangleIndex() derives the mask that isolates the triangle index
+#define MAX_NUM_PARTS_IN_BITS 10
+
+///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+{
+	//12 bytes: quantized AABB corners, three unsigned 16-bit components each
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes: >= 0 marks a leaf (triangle index, see getTriangleIndex); negative marks an internal node (negated escape offset, see getEscapeIndex)
+	int	m_escapeIndexOrTriangleIndex;
+} btQuantizedBvhNode;
+
+typedef struct
+{
+	float4		m_aabbMin;			//lower bound used to clamp and offset points before quantization
+	float4		m_aabbMax;			//upper bound used to clamp points before quantization
+	float4		m_quantization;		//per-component scale applied after subtracting m_aabbMin
+	int			m_numNodes;			//not read by the kernel in this file
+	int			m_numSubTrees;		//number of subtree headers the kernel iterates over
+	int			m_nodeOffset;		//index of this BVH's first node inside quantizedNodesRoot
+	int			m_subTreeOffset;	//index of this BVH's first header inside subtreeHeadersRoot
+	//field order mirrors the host-side b3BvhInfo struct (b3BvhInfo.h)
+} b3BvhInfo;
+
+int	getTriangleIndex(const btQuantizedBvhNode* rootNode)
+{
+	//all-ones shifted left by (31-MAX_NUM_PARTS_IN_BITS): covers the high bits that do not belong to the triangle index
+	const unsigned int highBitsMask = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
+	const unsigned int triangleBitsMask = ~highBitsMask;
+	return (rootNode->m_escapeIndexOrTriangleIndex&triangleBitsMask);
+}
+
+int isLeaf(const btQuantizedBvhNode* rootNode)
+{
+	//a leaf stores a non-negative triangle index; an internal node stores a negative skip index
+	return !(rootNode->m_escapeIndexOrTriangleIndex < 0);
+}
+	
+int getEscapeIndex(const btQuantizedBvhNode* rootNode)
+{
+	return 0 - rootNode->m_escapeIndexOrTriangleIndex;	//the stored value is negative on internal nodes, so this yields the positive skip count
+}
+
+typedef struct
+{
+	//12 bytes: quantized bounds of the whole subtree, compared against the query box before the subtree is walked
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes: node count; the traversal stops at m_rootNodeIndex+m_subtreeSize
+	int			m_subtreeSize;
+	int			m_padding[3];	//12 bytes of padding, not read by the kernel
+} btBvhSubtreeInfo;
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	int m_numChildShapes;	//child count for compound shapes; for a concave trimesh the kernel uses this field as the index into bvhInfos
+	int blaat2;	//not read by the kernel in this file
+	int m_shapeType;	//compared against the SHAPE_* defines
+	int m_shapeIndex;	//for compound shapes the kernel addresses the children as m_shapeIndex+b
+	//NOTE(review): the layout presumably has to match the host-side collidable struct -- confirm against btCollidable.h
+} btCollidableGpu;
+
+typedef struct	//not referenced by the kernel in this file
+{
+	float4	m_childPosition;	//NOTE(review): presumably the child's position relative to its compound -- confirm
+	float4	m_childOrientation;	//NOTE(review): presumably a quaternion relative to the compound -- confirm
+	int m_shapeIndex;
+	int m_unused0;	//the three m_unused fields bring the int part up to 16 bytes
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct
+{
+	float4 m_pos;	//the four float4 members are not read by the kernel in this file
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	//scalar properties
+	u32 m_collidableIdx;	//index into the collidables array
+	float m_invMass;	//0 is what the kernel's static-static pair rejection tests for
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+typedef struct 
+{
+	union	//three views of the minimum corner: vector, float lanes, int lanes (the kernel in this file reads m_min only)
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union	//the same three views for the maximum corner (the kernel in this file reads m_max only)
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+int testQuantizedAabbAgainstQuantizedAabb(
+								const unsigned short int* aabbMin1,
+								const unsigned short int* aabbMax1,
+								const unsigned short int* aabbMin2,
+								const unsigned short int* aabbMax2)
+{
+	//Overlap test for two quantized (unsigned 16-bit) boxes.
+	//The boxes overlap only if their [min,max] intervals intersect on
+	//each of the three axes. Touching intervals (min == max) count as
+	//overlapping because both comparisons are strict.
+	//
+	//Parameters: aabbMin1/aabbMax1 are the corners of the first box,
+	//aabbMin2/aabbMax2 those of the second; each points at 3 values.
+	//Returns 1 when the boxes overlap, 0 as soon as one axis separates them.
+	for (int axis=0;axis<3;axis++)
+	{
+		const int box1AfterBox2 = aabbMin1[axis] > aabbMax2[axis];
+		const int box1BeforeBox2 = aabbMax1[axis] < aabbMin2[axis];
+		if (box1AfterBox2 || box1BeforeBox2)
+		{
+			return 0;
+		}
+	}
+	return 1;
+}
+
+
+void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)
+{
+	//Converts point2 into three unsigned 16-bit values written to out[0..2].
+	//The point is clamped to [bvhAabbMin, bvhAabbMax], taken relative to bvhAabbMin and scaled by bvhQuantization.
+	const float4 clampedPoint = min(max(point2,bvhAabbMin), bvhAabbMax);
+	const float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;
+	const float scaled[3] = {v.x, v.y, v.z};
+
+	for (int axis=0;axis<3;axis++)
+	{
+		//isMax != 0: add one before truncating and set the lowest bit (odd result)
+		//isMax == 0: truncate and clear the lowest bit (even result)
+		if (isMax)
+			out[axis] = (unsigned short) (((unsigned short)(scaled[axis]+1.f) | 1));
+		else
+			out[axis] = (unsigned short) (((unsigned short)(scaled[axis]) & 0xfffe));
+	}
+
+}
+
+
+// work-in-progress
+__kernel void   bvhTraversalKernel( __global const int4* pairs, 
+									__global const BodyData* rigidBodies, 
+									__global const btCollidableGpu* collidables,
+									__global btAabbCL* aabbs,
+									__global int4* concavePairsOut,
+									__global volatile int* numConcavePairsOut,
+									__global const btBvhSubtreeInfo* subtreeHeadersRoot,
+									__global const btQuantizedBvhNode* quantizedNodesRoot,
+									__global const b3BvhInfo* bvhInfos,
+									int numPairs,
+									int maxNumConcavePairsCapacity)
+{
+	int id = get_global_id(0);
+	if (id>=numPairs)
+		return;
+	//pairs[id].x and .y are the body indices of the candidate pair handled by this work-item
+	int bodyIndexA = pairs[id].x;
+	int bodyIndexB = pairs[id].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+	//once the broadphase avoids static-static pairs, we can remove this test
+	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+	{
+		return;
+	}
+	//only pairs whose body A is a concave triangle mesh are handled here
+	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)
+		return;
+
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	//body B must be a convex hull, a sphere or a compound of convex hulls
+	if (shapeTypeB!=SHAPE_CONVEX_HULL &&
+		shapeTypeB!=SHAPE_SPHERE	&&
+		shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS
+		)
+		return;
+	//for body A the m_numChildShapes field is used as the index of its entry in bvhInfos
+	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];
+
+	float4 bvhAabbMin = bvhInfo.m_aabbMin;
+	float4 bvhAabbMax = bvhInfo.m_aabbMax;
+	float4 bvhQuantization = bvhInfo.m_quantization;
+	int numSubtreeHeaders = bvhInfo.m_numSubTrees;
+	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
+	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
+	//quantize body B's AABB (min handled downwards, max upwards) into this BVH's quantized space
+
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);
+	quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
+	//coarse pass over the subtree headers; only subtrees whose bounds overlap the query box are walked
+	for (int i=0;i<numSubtreeHeaders;i++)
+	{
+		btBvhSubtreeInfo subtree = subtreeHeaders[i];
+				
+		int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			int startNodeIndex = subtree.m_rootNodeIndex;
+			int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
+			int curIndex = startNodeIndex;
+			int escapeIndex;
+			int isLeafNode;
+			int aabbOverlap;
+			while (curIndex < endNodeIndex)	//linear walk over the subtree's nodes in array order
+			{
+				btQuantizedBvhNode rootNode = quantizedNodes[curIndex];
+				aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
+				isLeafNode = isLeaf(&rootNode);
+				if (aabbOverlap)
+				{
+					if (isLeafNode)
+					{
+						int triangleIndex = getTriangleIndex(&rootNode);
+						if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+						{
+								int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+								int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);	//reserves numChildrenB consecutive output slots, one per child shape
+								for (int b=0;b<numChildrenB;b++)
+								{
+									if ((pairIdx+b)<maxNumConcavePairsCapacity)	//slots at or past the capacity are not written
+									{
+										int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+										int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
+										concavePairsOut[pairIdx+b] = newPair;
+									}
+								}
+						} else
+						{
+							int pairIdx = atomic_inc(numConcavePairsOut);	//reserves a single output slot
+							if (pairIdx<maxNumConcavePairsCapacity)
+							{
+								int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);
+								concavePairsOut[pairIdx] = newPair;
+							}
+						}
+					} 
+					curIndex++;	//overlapping node (leaf or internal): continue with the next node in the array
+				} else
+				{
+					if (isLeafNode)
+					{
+						curIndex++;
+					} else
+					{
+						escapeIndex = getEscapeIndex(&rootNode);
+						curIndex += escapeIndex;	//non-overlapping internal node: jump ahead by its escape offset
+					}
+				}
+			}
+		}
+	}
+	//NOTE(review): numConcavePairsOut keeps growing past maxNumConcavePairsCapacity; presumably the host clamps it after the launch -- confirm
+}
+\ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
new file mode 100644
index 0000000000..4b3b49eae8
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
@@ -0,0 +1,258 @@
+//This file is autogenerated from bvhTraversal.cl using stringify.bat (premake --stringify) in the build folder of this project; edit the .cl source and regenerate instead of changing this file by hand.
+static const char* bvhTraversalKernelCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"#define SHAPE_SPHERE 7\n"
+"typedef unsigned int u32;\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} btQuantizedBvhNode;\n"
+"typedef struct\n"
+"{\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"int	getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
+"{\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"}\n"
+"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
+"{\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"}\n"
+"	\n"
+"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
+"{\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"}\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	int			m_padding[3];\n"
+"} btBvhSubtreeInfo;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	int m_numChildShapes;\n"
+"	int blaat2;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"int testQuantizedAabbAgainstQuantizedAabb(\n"
+"								const unsigned short int* aabbMin1,\n"
+"								const unsigned short int* aabbMax1,\n"
+"								const unsigned short int* aabbMin2,\n"
+"								const unsigned short int* aabbMax2)\n"
+"{\n"
+"	//int overlap = 1;\n"
+"	if (aabbMin1[0] > aabbMax2[0])\n"
+"		return 0;\n"
+"	if (aabbMax1[0] < aabbMin2[0])\n"
+"		return 0;\n"
+"	if (aabbMin1[1] > aabbMax2[1])\n"
+"		return 0;\n"
+"	if (aabbMax1[1] < aabbMin2[1])\n"
+"		return 0;\n"
+"	if (aabbMin1[2] > aabbMax2[2])\n"
+"		return 0;\n"
+"	if (aabbMax1[2] < aabbMin2[2])\n"
+"		return 0;\n"
+"	return 1;\n"
+"	//overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
+"	//overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
+"	//overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
+"	//return overlap;\n"
+"}\n"
+"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
+"{\n"
+"	float4 clampedPoint = max(point2,bvhAabbMin);\n"
+"	clampedPoint = min (clampedPoint, bvhAabbMax);\n"
+"	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
+"	if (isMax)\n"
+"	{\n"
+"		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
+"		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
+"		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
+"	} else\n"
+"	{\n"
+"		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
+"		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
+"		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
+"	}\n"
+"}\n"
+"// work-in-progress\n"
+"__kernel void   bvhTraversalKernel( __global const int4* pairs, \n"
+"									__global const BodyData* rigidBodies, \n"
+"									__global const btCollidableGpu* collidables,\n"
+"									__global btAabbCL* aabbs,\n"
+"									__global int4* concavePairsOut,\n"
+"									__global volatile int* numConcavePairsOut,\n"
+"									__global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
+"									__global const btQuantizedBvhNode* quantizedNodesRoot,\n"
+"									__global const b3BvhInfo* bvhInfos,\n"
+"									int numPairs,\n"
+"									int maxNumConcavePairsCapacity)\n"
+"{\n"
+"	int id = get_global_id(0);\n"
+"	if (id>=numPairs)\n"
+"		return;\n"
+"	\n"
+"	int bodyIndexA = pairs[id].x;\n"
+"	int bodyIndexB = pairs[id].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"	//once the broadphase avoids static-static pairs, we can remove this test\n"
+"	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"	{\n"
+"		return;\n"
+"	}\n"
+"		\n"
+"	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
+"		return;\n"
+"	int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		\n"
+"	if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
+"		shapeTypeB!=SHAPE_SPHERE	&&\n"
+"		shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
+"		)\n"
+"		return;\n"
+"	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
+"	float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
+"	float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
+"	float4 bvhQuantization = bvhInfo.m_quantization;\n"
+"	int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
+"	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
+"	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
+"	\n"
+"	unsigned short int quantizedQueryAabbMin[3];\n"
+"	unsigned short int quantizedQueryAabbMax[3];\n"
+"	quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
+"	quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
+"	\n"
+"	for (int i=0;i<numSubtreeHeaders;i++)\n"
+"	{\n"
+"		btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
+"				\n"
+"		int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
+"		if (overlap != 0)\n"
+"		{\n"
+"			int startNodeIndex = subtree.m_rootNodeIndex;\n"
+"			int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
+"			int curIndex = startNodeIndex;\n"
+"			int escapeIndex;\n"
+"			int isLeafNode;\n"
+"			int aabbOverlap;\n"
+"			while (curIndex < endNodeIndex)\n"
+"			{\n"
+"				btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
+"				aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
+"				isLeafNode = isLeaf(&rootNode);\n"
+"				if (aabbOverlap)\n"
+"				{\n"
+"					if (isLeafNode)\n"
+"					{\n"
+"						int triangleIndex = getTriangleIndex(&rootNode);\n"
+"						if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"						{\n"
+"								int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"								int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
+"								for (int b=0;b<numChildrenB;b++)\n"
+"								{\n"
+"									if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
+"									{\n"
+"										int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"										int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
+"										concavePairsOut[pairIdx+b] = newPair;\n"
+"									}\n"
+"								}\n"
+"						} else\n"
+"						{\n"
+"							int pairIdx = atomic_inc(numConcavePairsOut);\n"
+"							if (pairIdx<maxNumConcavePairsCapacity)\n"
+"							{\n"
+"								int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
+"								concavePairsOut[pairIdx] = newPair;\n"
+"							}\n"
+"						}\n"
+"					} \n"
+"					curIndex++;\n"
+"				} else\n"
+"				{\n"
+"					if (isLeafNode)\n"
+"					{\n"
+"						curIndex++;\n"
+"					} else\n"
+"					{\n"
+"						escapeIndex = getEscapeIndex(&rootNode);\n"
+"						curIndex += escapeIndex;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+;
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
new file mode 100644
index 0000000000..e754f4e1da
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
@@ -0,0 +1,311 @@
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+#define AppendInc(x, out) out = atomic_inc(x)
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
+#ifdef cl_ext_atomic_counters_32
+	#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+	#define counter32_t volatile __global int*
+#endif
+
+
+__kernel void   mprPenetrationKernel( __global int4* pairs,
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global float4* separatingNormals,
+																					__global int* hasSeparatingAxis,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int contactCapacity,
+																					int numPairs)
+{
+	int i = get_global_id(0);
+	int pairIndex = i;
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;	//not used further in this kernel
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;	//not used further in this kernel
+		
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		
+		//only convex-hull versus convex-hull pairs are processed by this kernel
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			return;
+		}
+
+		float depthOut;
+		b3Float4 dirOut;
+		b3Float4 posOut;
+
+
+		int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);
+		
+		
+		
+		
+		//a contact is emitted only when b3MprPenetration returns 0
+		if (res==0)
+		{
+			//add a contact
+
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );	//dstIdx receives the counter value from before the increment
+			if (dstIdx<contactCapacity)	//results at or past the capacity are dropped; the counter has still been incremented
+			{
+				pairs[pairIndex].z = dstIdx;	//remember the contact slot in the pair
+				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;
+				c->m_worldNormalOnB = -dirOut;//normal;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);	//fixed restitution 0 and friction 0.7, scaled by 0xffff
+				c->m_batchIdx = pairIndex;
+				int bodyA = pairs[pairIndex].x;
+				int bodyB = pairs[pairIndex].y;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;	//a body with zero inverse mass is stored with a negated index
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
+				c->m_childIndexA = -1;
+				c->m_childIndexB = -1;
+				//for (int i=0;i<nContacts;i++)
+				posOut.w = -depthOut;	//the negated depth travels in the w lane of the contact position
+				c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];
+				GET_NPOINTS(*c) = 1;//nContacts;
+			}
+		}
+
+	}
+}
+
+typedef float4 Quaternion;	//as used by qtMul/qtInvert: xyz = vector part, w = scalar part
+#define make_float4 (float4)
+
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	a.w = 0.f;	//clear the fourth lane of both (by-value) operands so that only x,y,z contribute
+	b.w = 0.f;
+	return dot(a, b);
+}
+
+
+
+
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	return cross(a,b);	//forwards to the OpenCL built-in cross()
+}
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	//Quaternion product a*b, with the scalar part kept in the w lane.
+	//vector part: cross(a,b) + a.w*b + b.w*a (the w lane of this sum is overwritten below)
+	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);
+	//scalar part: a.w*b.w minus the three-component dot product
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+}
+
+__inline
+Quaternion qtInvert(Quaternion q)
+{
+	return (Quaternion)(-q.xyz, q.w);	//negates the xyz part and keeps w, i.e. the conjugate (equal to the inverse only for unit-length q)
+}
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	//Rotates vec by q through the product q * (vec.xyz, 0) * qtInvert(q).
+	vec.w = 0.f;	//treat the (by-value) input as a quaternion with zero scalar part
+	const Quaternion leftProduct = qtMul(q,vec);
+	const Quaternion qInv = qtInvert( q );
+	return qtMul(leftProduct,qInv);
+}
+
+__inline
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	return translation[0] + qtRotate( orientation[0], p[0] );	//rotate *p by *orientation, then offset the result by *translation
+}
+
+
+__inline
+float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	return qtRotate( qtInvert( q ), vec );	//applies qtRotate with qtInvert(q), i.e. the opposite rotation for unit-length q
+}
+
+
+inline void project(__global const b3ConvexPolyhedronData_t* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+{
+	//Projects all vertices of hull onto *dir and writes the covered interval to min[0] / max[0].
+	//The direction is rotated by the inverse of orn so that the stored vertices can be used as they are;
+	//dot(pos,*dir) then shifts both ends of the interval.
+	const float4 localDir = qtInvRotate(orn,*dir);
+	const float offset = dot(pos,*dir);
+	const int numVerts = hull->m_numVertices;
+
+	float lowest = FLT_MAX;
+	float highest = -FLT_MAX;
+	for(int v=0;v<numVerts;v++)
+	{
+		const float dp = dot(vertices[hull->m_vertexOffset+v],localDir);
+		lowest = (dp < lowest) ? dp : lowest;
+		highest = (dp > highest) ? dp : highest;
+	}
+
+	//the ends are exchanged if they come out inverted (they start as FLT_MAX / -FLT_MAX, e.g. when the loop never ran)
+	const int inverted = (lowest>highest);
+	const float low = inverted ? highest : lowest;
+	const float high = inverted ? lowest : highest;
+	min[0] = low + offset;
+	max[0] = high + offset;
+}
+
+
+bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices,
+	__global const float4* unitSphereDirections,
+	int numUnitSphereDirections,
+	float4* sep,
+	float* dmin)
+{
+	//Tests the candidate axes in unitSphereDirections for separation between hullA and hullB and
+	//keeps track of the axis with the smallest overlap.
+	//  hullA, hullB           : the two convex hulls being compared
+	//  posA1/ornA, posB1/ornB : position and orientation of each hull (the w lane of the positions is cleared)
+	//  DeltaC2                : vector used to orient every candidate axis and the final *sep
+	//  vertices               : vertex buffer passed on to project()
+	//  unitSphereDirections   : candidate axes, numUnitSphereDirections entries are read
+	//  sep, dmin              : in/out; replaced when an axis with an overlap below *dmin is found,
+	//                           and *sep may be negated before true is returned
+	//Returns false as soon as one axis separates the projections, true otherwise.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	for (int dirIndex=0;dirIndex<numUnitSphereDirections;dirIndex++)
+	{
+		//flip the candidate whenever it has a positive dot product with DeltaC2
+		float4 axis = unitSphereDirections[dirIndex];
+		if (dot3F4(DeltaC2,axis)>0)
+			axis *= -1.f;
+
+		float minA,maxA;
+		float minB,maxB;
+		project(hullA,posA,ornA,&axis,vertices, &minA, &maxA);
+		project(hullB,posB,ornB,&axis,vertices, &minB, &maxB);
+
+		//disjoint intervals: this axis separates the hulls
+		if(maxA<minB || maxB<minA)
+			return false;
+
+		//overlap on this axis = the smaller of the two end-to-end distances
+		const float depthAB = maxA - minB;
+		const float depthBA = maxB - minA;
+		const float overlap = depthAB<depthBA ? depthAB:depthBA;
+		if(overlap<*dmin)
+		{
+			*dmin = overlap;
+			*sep = axis;
+		}
+	}
+
+	//final orientation fix: negate *sep when dot3F4(-DeltaC2,*sep) is positive
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+
+__kernel void   findSeparatingAxisUnitSphereKernel( __global const int4* pairs, 
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* unitSphereDirections,
+																					__global  float4* separatingNormals,
+																					__global  int* hasSeparatingAxis,
+																					__global  float* dmins,
+																					int numUnitSphereDirections,
+																					int numPairs
+																					)
+{
+	//One work-item per pair. Pairs whose hasSeparatingAxis flag is already 0 are left untouched.
+	//For the remaining pairs the unit-sphere direction set is tested, but only when the number of
+	//edge-edge candidates (unique edges of A times unique edges of B) exceeds numUnitSphereDirections.
+	//The flag is then rewritten (0 or 1); when it is set to 1 the (possibly updated) normal is written
+	//back to separatingNormals as well.
+	const int pairIndex = get_global_id(0);
+	if (pairIndex>=numPairs)
+		return;
+	if (!hasSeparatingAxis[pairIndex])
+		return;
+
+	const int bodyIndexA = pairs[pairIndex].x;
+	const int bodyIndexB = pairs[pairIndex].y;
+	const int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	const int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	const int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	const int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	//body positions with the w lane cleared, and body orientations
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	posA.w = 0.f;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	posB.w = 0.f;
+	float4 ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 ornB = rigidBodies[bodyIndexB].m_quat;
+
+	//hull centres rotated by the body orientation and offset by the body position
+	float4 centerLocalA = convexShapes[shapeIndexA].m_localCenter;
+	float4 centerLocalB = convexShapes[shapeIndexB].m_localCenter;
+	const float4 centerA = transform(&centerLocalA, &posA, &ornA);
+	const float4 centerB = transform(&centerLocalB, &posB, &ornB);
+	const float4 DeltaC2 = centerA - centerB;
+
+	//nothing to do unless there are more edge-edge candidates than unit-sphere directions
+	const int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
+	if (numEdgeEdgeDirections<=numUnitSphereDirections)
+		return;
+
+	//seed the search with the values stored for this pair so far
+	float dmin = dmins[pairIndex];
+	float4 sepNormal = separatingNormals[pairIndex];
+	const bool noAxisSeparates = findSeparatingAxisUnitSphere(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																							posB,ornB,
+																							DeltaC2,
+																							vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);
+	if (noAxisSeparates)
+	{
+		hasSeparatingAxis[pairIndex] = 1;
+		separatingNormals[pairIndex] = sepNormal;
+	} else
+	{
+		hasSeparatingAxis[pairIndex] = 0;
+	}
+	//the updated dmin stays local to this work-item; dmins itself is not written
+}
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
new file mode 100644
index 0000000000..7ed4b382c3
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
@@ -0,0 +1,1446 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* mprKernelsCL= \
+"/***\n"
+" * ---------------------------------\n"
+" * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>\n"
+" *\n"
+" *  This file was ported from mpr.c file, part of libccd.\n"
+" *  The Minkoski Portal Refinement implementation was ported \n"
+" *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.\n"
+" *  at http://github.com/erwincoumans/bullet3\n"
+" *\n"
+" *  Distributed under the OSI-approved BSD License (the \"License\");\n"
+" *  see <http://www.opensource.org/licenses/bsd-license.php>.\n"
+" *  This software is distributed WITHOUT ANY WARRANTY; without even the\n"
+" *  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
+" *  See the License for more information.\n"
+" */\n"
+"#ifndef B3_MPR_PENETRATION_H\n"
+"#define B3_MPR_PENETRATION_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" // returns true when |v.x|, |v.y| and |v.z| are all <= 1e-6; v.w is not examined
+"{\n"
+"	if(b3Fabs(v.x)>1e-6f || b3Fabs(v.y)>1e-6f || b3Fabs(v.z)>1e-6f)	\n" // 'f' suffix: keep the threshold single precision so the kernel contains no double constants
+"		return false;\n"
+"	return true;\n"
+"}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" // returns the index of the vecArray entry with the largest xyz dot product against vec; the maximum itself is written to *dotOut
+"{\n"
+"    float maxDot = -B3_INFINITY;\n" // B3_INFINITY is the finite 1e18f defined earlier in this string, not IEEE infinity
+"    int i = 0;\n"
+"    int ptIndex = -1;\n" // -1 means no candidate has been accepted yet
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n" // strict '>' keeps the first index when several entries tie
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n" // b3Assert is defined empty in this kernel build, so this line is a no-op here
+"    if (ptIndex<0)\n" // nothing accepted (e.g. vecLen <= 0): fall back to index 0; *dotOut is then -B3_INFINITY
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"{\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"{\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"}\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"}\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"typedef struct\n"
+"{\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"}\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n" // mtMul3: row-vector times matrix, i.e. each result component is dot3(a, column of b)
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" // gather the columns of b from its three rows
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans = b3MakeFloat4(0.f, 0.f, 0.f, 0.f);\n" // zero-initialise so the returned w is defined (it was left uninitialised; mtMul1 also returns w = 0)
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"{\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"};\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"{\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"};\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#define B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"typedef struct b3GpuFace b3GpuFace_t;\n"
+"struct b3GpuFace\n"
+"{\n"
+"	b3Float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"	int m_unusedPadding1;\n"
+"	int m_unusedPadding2;\n"
+"};\n"
+"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n"
+"struct b3ConvexPolyhedronData\n"
+"{\n"
+"	b3Float4		m_localCenter;\n"
+"	b3Float4		m_extents;\n"
+"	b3Float4		mC;\n"
+"	b3Float4		mE;\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"};\n"
+"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#ifndef B3_COLLIDABLE_H\n"
+"#define B3_COLLIDABLE_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"enum b3ShapeTypes\n"
+"{\n"
+"	SHAPE_HEIGHT_FIELD=1,\n"
+"	SHAPE_CONVEX_HULL=3,\n"
+"	SHAPE_PLANE=4,\n"
+"	SHAPE_CONCAVE_TRIMESH=5,\n"
+"	SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
+"	SHAPE_SPHERE=7,\n"
+"	MAX_NUM_SHAPE_TYPES,\n"
+"};\n"
+"typedef struct b3Collidable b3Collidable_t;\n"
+"struct b3Collidable\n"
+"{\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"};\n"
+"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
+"struct b3GpuChildShape\n"
+"{\n"
+"	b3Float4	m_childPosition;\n"
+"	b3Quat		m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"};\n"
+"struct b3CompoundOverlappingPair\n"
+"{\n"
+"	int m_bodyIndexA;\n"
+"	int m_bodyIndexB;\n"
+"//	int	m_pairType;\n"
+"	int m_childShapeIndexA;\n"
+"	int m_childShapeIndexB;\n"
+"};\n"
+"#endif //B3_COLLIDABLE_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#define B3_MPR_SQRT sqrt\n"
+"#endif\n"
+"#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))\n"
+"#define B3_MPR_FABS fabs\n"
+"#define B3_MPR_TOLERANCE 1E-6f\n"
+"#define B3_MPR_MAX_ITERATIONS 1000\n"
+"struct _b3MprSupport_t \n"
+"{\n"
+"    b3Float4 v;  //!< Support point in minkowski sum\n"
+"    b3Float4 v1; //!< Support point in obj1\n"
+"    b3Float4 v2; //!< Support point in obj2\n"
+"};\n"
+"typedef struct _b3MprSupport_t b3MprSupport_t;\n"
+"struct _b3MprSimplex_t \n"
+"{\n"
+"    b3MprSupport_t ps[4];\n"
+"    int last; //!< index of last added point\n"
+"};\n"
+"typedef struct _b3MprSimplex_t b3MprSimplex_t;\n"
+"inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)\n"
+"{\n"
+"    return &s->ps[idx];\n"
+"}\n"
+"inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)\n"
+"{\n"
+"    s->last = size - 1;\n"
+"}\n"
+"inline int b3MprSimplexSize(const b3MprSimplex_t *s)\n"
+"{\n"
+"    return s->last + 1;\n"
+"}\n"
+"inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)\n"
+"{\n"
+"    // here is no check on boundaries\n"
+"    return &s->ps[idx];\n"
+"}\n"
+"inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)\n"
+"{\n"
+"    *d = *s;\n"
+"}\n"
+"inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)\n"
+"{\n"
+"    b3MprSupportCopy(s->ps + pos, a);\n"
+"}\n"
+"inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)\n"
+"{\n"
+"    b3MprSupport_t supp;\n"
+"    b3MprSupportCopy(&supp, &s->ps[pos1]);\n"
+"    b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);\n"
+"    b3MprSupportCopy(&s->ps[pos2], &supp);\n"
+"}\n"
+"inline int b3MprIsZero(float val)\n"
+"{\n"
+"    return B3_MPR_FABS(val) < FLT_EPSILON;\n"
+"}\n"
+"inline int b3MprEq(float _a, float _b)\n"
+"{\n"
+"    float ab;\n"
+"    float a, b;\n"
+"    ab = B3_MPR_FABS(_a - _b);\n"
+"    if (B3_MPR_FABS(ab) < FLT_EPSILON)\n"
+"        return 1;\n"
+"    a = B3_MPR_FABS(_a);\n"
+"    b = B3_MPR_FABS(_b);\n"
+"    if (b > a){\n"
+"        return ab < FLT_EPSILON * b;\n"
+"    }else{\n"
+"        return ab < FLT_EPSILON * a;\n"
+"    }\n"
+"}\n"
+"inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)\n"
+"{\n"
+"    return b3MprEq((*a).x, (*b).x)\n"
+"            && b3MprEq((*a).y, (*b).y)\n"
+"            && b3MprEq((*a).z, (*b).z);\n"
+"}\n"
+"inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, 	b3ConstArray(b3Float4) verticesA)\n"
+"{\n"
+"	b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n"
+"	float maxDot = -B3_LARGE_FLOAT;\n"
+"    if( 0 < hull->m_numVertices )\n"
+"    {\n"
+"        const b3Float4 scaled = supportVec;\n"
+"		int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n"
+"        return verticesA[hull->m_vertexOffset+index];\n"
+"    }\n"
+"    return supVec;\n"
+"}\n"
+"B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" // writes to *outp the world-space support vertex of body bodyIndex along *_dir; pairIndex and sepAxis are not referenced in the body
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"														const b3Float4* _dir, b3Float4* outp, int logme)\n" // logme selects between two code paths that perform the same vertex lookup
+"{\n"
+"	//dir is in worldspace, move to local space\n"
+"	\n"
+"	b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;\n"
+"	b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;\n"
+"	\n"
+"	b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);\n" // drop the caller's w component before rotating
+"	\n"
+"	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);\n"
+"	\n"
+"	//find local support vertex\n"
+"	int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;\n"
+"	\n"
+"	b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);\n"
+"	__global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];\n"
+"	\n"
+"	b3Float4 pInA = b3MakeFloat4(0,0,0,0);\n" // fix: was uninitialised, and was read below when logme != 0 and the hull had no vertices
+"	if (logme)\n"
+"	{\n"
+"		// pInA keeps its zero default when the hull has no vertices (same result as b3LocalGetSupportVertex)\n"
+"		float maxDot = -B3_LARGE_FLOAT;\n"
+"		if( 0 < hull->m_numVertices )\n"
+"		{\n"
+"			const b3Float4 scaled = localDir;\n"
+"			int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n"
+"			pInA = cpuVertices[hull->m_vertexOffset+index];\n"
+"			\n"
+"		}\n"
+"	} else\n"
+"	{\n"
+"		pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);\n"
+"	}\n"
+"	//move vertex to world space\n"
+"	*outp = b3TransformPoint(pInA,pos,orn);\n"
+"	\n"
+"}\n"
+"inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB,   b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" // fills *supp with the support point of the pair along *_dir: v1 from body A, v2 from body B, v = v1 - v2
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"													const b3Float4* _dir, b3MprSupport_t *supp)\n"
+"{\n"
+"    b3Float4 dir;\n"
+"	dir = *_dir;\n"
+"	b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);\n" // body A is queried along +dir (logme = 0)
+"    dir = *_dir*-1.f;\n"
+"	b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);\n" // body B is queried along -dir
+"    supp->v = supp->v1 - supp->v2;\n" // difference of the two world-space support vertices
+"}\n"
+"inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)\n"
+"{\n"
+"    center->v1 = cpuBodyBuf[bodyIndexA].m_pos;\n"
+"	center->v2 = cpuBodyBuf[bodyIndexB].m_pos;\n"
+"    center->v = center->v1 - center->v2;\n"
+"}\n"
+"inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)\n"
+"{\n"
+"	(*v).x = x;\n"
+"	(*v).y = y;\n"
+"	(*v).z = z;\n"
+"	(*v).w = 0.f;\n"
+"}\n"
+"inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)\n"
+"{\n"
+"    (*v).x += (*w).x;\n"
+"    (*v).y += (*w).y;\n"
+"    (*v).z += (*w).z;\n"
+"}\n"
+"inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)\n"
+"{\n"
+"    *v = *w;\n"
+"}\n"
+"inline void b3MprVec3Scale(b3Float4 *d, float k)\n"
+"{\n"
+"    *d *= k;\n"
+"}\n"
+"inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)\n"
+"{\n"
+"    float dot;\n"
+"	dot = b3Dot3F4(*a,*b);\n"
+"    return dot;\n"
+"}\n"
+"inline float b3MprVec3Len2(const b3Float4 *v)\n"
+"{\n"
+"    return b3MprVec3Dot(v, v);\n"
+"}\n"
+"inline void b3MprVec3Normalize(b3Float4 *d)\n"
+"{\n"
+"    float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));\n"
+"    b3MprVec3Scale(d, k);\n"
+"}\n"
+"inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)\n"
+"{\n"
+"	*d = b3Cross3(*a,*b);\n"
+"	\n"
+"}\n"
+"inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)\n"
+"{\n"
+"	*d = *v - *w;\n"
+"}\n"
+"inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)\n"
+"{\n"
+"    b3Float4 v2v1, v3v1;\n"
+"    b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b3MprVec3Cross(dir, &v2v1, &v3v1);\n"
+"    b3MprVec3Normalize(dir);\n"
+"}\n"
+"inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,\n"
+"                                       const b3Float4 *dir)\n"
+"{\n"
+"    float dot;\n"
+"    dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);\n"
+"    return b3MprIsZero(dot) || dot > 0.f;\n"
+"}\n"
+"inline int portalReachTolerance(const b3MprSimplex_t *portal,\n"
+"                                     const b3MprSupport_t *v4,\n"
+"                                     const b3Float4 *dir)\n"
+"{\n"
+"    float dv1, dv2, dv3, dv4;\n"
+"    float dot1, dot2, dot3;\n"
+"    // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}\n"
+"    dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);\n"
+"    dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);\n"
+"    dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);\n"
+"    dv4 = b3MprVec3Dot(&v4->v, dir);\n"
+"    dot1 = dv4 - dv1;\n"
+"    dot2 = dv4 - dv2;\n"
+"    dot3 = dv4 - dv3;\n"
+"    dot1 = B3_MPR_FMIN(dot1, dot2);\n"
+"    dot1 = B3_MPR_FMIN(dot1, dot3);\n"
+"    return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;\n"
+"}\n"
+"inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal,   \n"
+"                                         const b3MprSupport_t *v4,\n"
+"                                         const b3Float4 *dir)\n"
+"{\n"
+"    float dot;\n"
+"    dot = b3MprVec3Dot(&v4->v, dir);\n"
+"    return b3MprIsZero(dot) || dot > 0.f;\n"
+"}\n"
+"inline void b3ExpandPortal(b3MprSimplex_t *portal,\n"
+"                              const b3MprSupport_t *v4)\n"
+"{\n"
+"    float dot;\n"
+"    b3Float4 v4v0;\n"
+"    b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);\n"
+"    if (dot > 0.f){\n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);\n"
+"        if (dot > 0.f){\n"
+"            b3MprSimplexSet(portal, 1, v4);\n"
+"        }else{\n"
+"            b3MprSimplexSet(portal, 3, v4);\n"
+"        }\n"
+"    }else{\n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);\n"
+"        if (dot > 0.f){\n"
+"            b3MprSimplexSet(portal, 2, v4);\n"
+"        }else{\n"
+"            b3MprSimplexSet(portal, 1, v4);\n"
+"        }\n"
+"    }\n"
+"}\n"
+"B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" // builds the initial 4-point portal for the pair. Returns 0 when the portal is complete, 1 when the origin lies on v1, 2 when it lies on the v0-v1 segment, -1 when a support point fails the origin test
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n" // only forwarded to b3MprSupport
+"													__global int*	hasSepAxis,\n" // never referenced in this function
+"													b3MprSimplex_t *portal)\n" // output: ps[0..3] and 'last' are written here
+"{\n"
+"    b3Float4 dir, va, vb;\n"
+"    float dot;\n"
+"    int cont;\n"
+"	\n"
+"	\n"
+"    // vertex 0 is center of portal\n"
+"    b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));\n" // v0 = m_pos of body A minus m_pos of body B
+"    // vertex 0 is center of portal\n"
+"    b3MprSimplexSetSize(portal, 1);\n"
+"	\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){\n"
+"        // Portal's center lies on origin (0,0,0) => we know that objects\n"
+"        // intersect but we would need to know penetration info.\n"
+"        // So move center little bit...\n"
+"        b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);\n"
+"        b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);\n"
+"    }\n"
+"    // vertex 1 = support in direction of origin\n"
+"    b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Scale(&dir, -1.f);\n"
+"    b3MprVec3Normalize(&dir);\n"
+"    b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));\n"
+"    b3MprSimplexSetSize(portal, 2);\n"
+"    // test if origin isn't outside of v1\n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);\n"
+"	\n"
+"    if (b3MprIsZero(dot) || dot < 0.f)\n"
+"        return -1;\n"
+"    // vertex 2\n"
+"    b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    if (b3MprIsZero(b3MprVec3Len2(&dir))){\n" // v0 x v1 is (near) zero: v0, v1 and the origin are treated as collinear
+"        if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){\n"
+"            // origin lies on v1\n"
+"            return 1;\n"
+"        }else{\n"
+"            // origin lies on v0-v1 segment\n"
+"            return 2;\n"
+"        }\n"
+"    }\n"
+"    b3MprVec3Normalize(&dir);\n"
+"	 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));\n"
+"    \n"
+"    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);\n"
+"    if (b3MprIsZero(dot) || dot < 0.f)\n"
+"        return -1;\n"
+"    b3MprSimplexSetSize(portal, 3);\n"
+"    // vertex 3 direction\n"
+"    b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                     &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                     &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Cross(&dir, &va, &vb);\n"
+"    b3MprVec3Normalize(&dir);\n"
+"    // it is better to form portal faces to be oriented \"outside\" origin\n"
+"    dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    if (dot > 0.f){\n"
+"        b3MprSimplexSwap(portal, 1, 2);\n"
+"        b3MprVec3Scale(&dir, -1.f);\n"
+"    }\n"
+"    while (b3MprSimplexSize(portal) < 4){\n" // NOTE(review): this loop has no iteration cap; it exits only via return -1 or once neither replacement test below fires
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));\n"
+"        \n"
+"        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);\n"
+"        if (b3MprIsZero(dot) || dot < 0.f)\n"
+"            return -1;\n"
+"        cont = 0;\n" // cont = 1 means v3 replaced v1 or v2 and a new candidate direction is needed
+"        // test if origin is outside (v1, v0, v3) - set v2 as v3 and\n"
+"        // continue\n"
+"        b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                          &b3MprSimplexPoint(portal, 3)->v);\n"
+"        dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n"
+"        if (dot < 0.f && !b3MprIsZero(dot)){\n"
+"            b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));\n"
+"            cont = 1;\n"
+"        }\n"
+"        if (!cont){\n"
+"            // test if origin is outside (v3, v0, v2) - set v1 as v3 and\n"
+"            // continue\n"
+"            b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                              &b3MprSimplexPoint(portal, 2)->v);\n"
+"            dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n"
+"            if (dot < 0.f && !b3MprIsZero(dot)){\n"
+"                b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));\n"
+"                cont = 1;\n"
+"            }\n"
+"        }\n"
+"        if (cont){\n"
+"            b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                             &b3MprSimplexPoint(portal, 0)->v);\n"
+"            b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                             &b3MprSimplexPoint(portal, 0)->v);\n"
+"            b3MprVec3Cross(&dir, &va, &vb);\n"
+"            b3MprVec3Normalize(&dir);\n"
+"        }else{\n"
+"            b3MprSimplexSetSize(portal, 4);\n" // both tests passed: accept v3, which ends the loop
+"        }\n"
+"    }\n"
+"    return 0;\n"
+"}\n"
+"B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" // iteratively refines *portal. Returns 0 once portalEncapsulesOrigin succeeds, -1 when expansion is impossible, the tolerance is reached, or the iteration cap is exhausted
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"													b3MprSimplex_t *portal)\n"
+"{\n"
+"    b3Float4 dir;\n"
+"    b3MprSupport_t v4;\n"
+"	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" // bounded replacement for the original unbounded loop (see the commented-out while below)
+"    //while (1)\n"
+"	{\n"
+"        // compute direction outside the portal (from v0 throught v1,v2,v3\n"
+"        // face)\n"
+"        b3PortalDir(portal, &dir);\n"
+"        // test if origin is inside the portal\n"
+"        if (portalEncapsulesOrigin(portal, &dir))\n"
+"            return 0;\n"
+"        // get next support point\n"
+"        \n"
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n"
+"        // test if v4 can expand portal to contain origin and if portal\n"
+"        // expanding doesn't reach given tolerance\n"
+"        if (!portalCanEncapsuleOrigin(portal, &v4, &dir)\n"
+"                || portalReachTolerance(portal, &v4, &dir))\n"
+"		{\n"
+"            return -1;\n"
+"        }\n"
+"        // v1-v2-v3 triangle must be rearranged to face outside Minkowski\n"
+"        // difference (direction from v0).\n"
+"        b3ExpandPortal(portal, &v4);\n" // replaces one of v1/v2/v3 with v4
+"    }\n"
+"    return -1;\n" // iteration cap reached without enclosing the origin
+"}\n"
+"B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)\n" // writes to *pos the midpoint of the two weighted sums of the portal's v1 and v2 support points
+"{\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    b3Float4 dir;\n"
+"    size_t i;\n"
+"    float b[4], sum, inv;\n" // b[i] is the weight applied to portal point i
+"    b3Float4 vec, p1, p2;\n"
+"    b3PortalDir(portal, &dir);\n"
+"    // use barycentric coordinates of tetrahedron to find origin\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                       &b3MprSimplexPoint(portal, 2)->v);\n"
+"    b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                       &b3MprSimplexPoint(portal, 2)->v);\n"
+"    b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n"
+"    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                       &b3MprSimplexPoint(portal, 1)->v);\n"
+"    b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n"
+"	sum = b[0] + b[1] + b[2] + b[3];\n"
+"    if (b3MprIsZero(sum) || sum < 0.f){\n" // fallback: drop point 0 and recompute the other three weights against the portal direction
+"		b[0] = 0.f;\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n"
+"                           &b3MprSimplexPoint(portal, 3)->v);\n"
+"        b[1] = b3MprVec3Dot(&vec, &dir);\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n"
+"                           &b3MprSimplexPoint(portal, 1)->v);\n"
+"        b[2] = b3MprVec3Dot(&vec, &dir);\n"
+"        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n"
+"                           &b3MprSimplexPoint(portal, 2)->v);\n"
+"        b[3] = b3MprVec3Dot(&vec, &dir);\n"
+"		sum = b[1] + b[2] + b[3];\n"
+"	}\n"
+"	inv = 1.f / sum;\n" // NOTE(review): sum is not re-checked after the fallback, so a zero sum divides by zero here
+"    b3MprVec3Copy(&p1, b3mpr_vec3_origin);\n"
+"    b3MprVec3Copy(&p2, b3mpr_vec3_origin);\n"
+"    for (i = 0; i < 4; i++){\n" // accumulate the weighted support points of each body separately
+"        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);\n"
+"        b3MprVec3Scale(&vec, b[i]);\n"
+"        b3MprVec3Add(&p1, &vec);\n"
+"        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);\n"
+"        b3MprVec3Scale(&vec, b[i]);\n"
+"        b3MprVec3Add(&p2, &vec);\n"
+"    }\n"
+"    b3MprVec3Scale(&p1, inv);\n"
+"    b3MprVec3Scale(&p2, inv);\n"
+"    b3MprVec3Copy(pos, &p1);\n"
+"    b3MprVec3Add(pos, &p2);\n"
+"    b3MprVec3Scale(pos, 0.5);\n" // *pos = (p1 + p2) / 2
+"}\n"
+"inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)\n"
+"{\n"
+"    b3Float4 ab;\n"
+"    b3MprVec3Sub2(&ab, a, b);\n"
+"    return b3MprVec3Len2(&ab);\n"
+"}\n"
+"inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,\n" // returns the squared distance from point P to the segment x0-b
+"                                                  const b3Float4 *x0,\n"
+"                                                  const b3Float4 *b,\n"
+"                                                  b3Float4 *witness)\n" // optional output: when non-null it receives the closest point on the segment
+"{\n"
+"    // The computation comes from solving equation of segment:\n"
+"    //      S(t) = x0 + t.d\n"
+"    //          where - x0 is initial point of segment\n"
+"    //                - d is direction of segment from x0 (|d| > 0)\n"
+"    //                - t belongs to <0, 1> interval\n"
+"    // \n"
+"    // Than, distance from a segment to some point P can be expressed:\n"
+"    //      D(t) = |x0 + t.d - P|^2\n"
+"    //          which is distance from any point on segment. Minimization\n"
+"    //          of this function brings distance from P to segment.\n"
+"    // Minimization of D(t) leads to simple quadratic equation that's\n"
+"    // solving is straightforward.\n"
+"    //\n"
+"    // Bonus of this method is witness point for free.\n"
+"    float dist, t;\n"
+"    b3Float4 d, a;\n"
+"    // direction of segment\n"
+"    b3MprVec3Sub2(&d, b, x0);\n"
+"    // precompute vector from P to x0\n"
+"    b3MprVec3Sub2(&a, x0, P);\n"
+"    t  = -1.f * b3MprVec3Dot(&a, &d);\n"
+"    t /= b3MprVec3Len2(&d);\n" // NOTE(review): divides by |d|^2 with no guard, so x0 == b gives a division by zero
+"    if (t < 0.f || b3MprIsZero(t)){\n" // closest point is the x0 end
+"        dist = b3MprVec3Dist2(x0, P);\n"
+"        if (witness)\n"
+"            b3MprVec3Copy(witness, x0);\n"
+"    }else if (t > 1.f || b3MprEq(t, 1.f)){\n" // closest point is the b end
+"        dist = b3MprVec3Dist2(b, P);\n"
+"        if (witness)\n"
+"            b3MprVec3Copy(witness, b);\n"
+"    }else{\n" // closest point is interior: x0 + t*d
+"        if (witness){\n"
+"            b3MprVec3Copy(witness, &d);\n"
+"            b3MprVec3Scale(witness, t);\n"
+"            b3MprVec3Add(witness, x0);\n"
+"            dist = b3MprVec3Dist2(witness, P);\n"
+"        }else{\n"
+"            // recycling variables\n"
+"            b3MprVec3Scale(&d, t);\n"
+"            b3MprVec3Add(&d, &a);\n" // d becomes t*d + (x0 - P), the vector from P to the closest point
+"            dist = b3MprVec3Len2(&d);\n"
+"        }\n"
+"    }\n"
+"    return dist;\n"
+"}\n"
+"inline float b3MprVec3PointTriDist2(const b3Float4 *P,\n"
+"                                const b3Float4 *x0, const b3Float4 *B,\n"
+"                                const b3Float4 *C,\n"
+"                                b3Float4 *witness)\n"
+"{\n"
+"    // Computation comes from analytic expression for triangle (x0, B, C)\n"
+"    //      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and\n"
+"    // Then equation for distance is:\n"
+"    //      D(s, t) = | T(s, t) - P |^2\n"
+"    // This leads to minimization of quadratic function of two variables.\n"
+"    // The solution from is taken only if s is between 0 and 1, t is\n"
+"    // between 0 and 1 and t + s < 1, otherwise distance from segment is\n"
+"    // computed.\n"
+"    b3Float4 d1, d2, a;\n"
+"    float u, v, w, p, q, r;\n"
+"    float s, t, dist, dist2;\n"
+"    b3Float4 witness2;\n"
+"    b3MprVec3Sub2(&d1, B, x0);\n"
+"    b3MprVec3Sub2(&d2, C, x0);\n"
+"    b3MprVec3Sub2(&a, x0, P);\n"
+"    u = b3MprVec3Dot(&a, &a);\n"
+"    v = b3MprVec3Dot(&d1, &d1);\n"
+"    w = b3MprVec3Dot(&d2, &d2);\n"
+"    p = b3MprVec3Dot(&a, &d1);\n"
+"    q = b3MprVec3Dot(&a, &d2);\n"
+"    r = b3MprVec3Dot(&d1, &d2);\n"
+"    s = (q * r - w * p) / (w * v - r * r);\n"
+"    t = (-s * r - q) / w;\n"
+"    if ((b3MprIsZero(s) || s > 0.f)\n"
+"            && (b3MprEq(s, 1.f) || s < 1.f)\n"
+"            && (b3MprIsZero(t) || t > 0.f)\n"
+"            && (b3MprEq(t, 1.f) || t < 1.f)\n"
+"            && (b3MprEq(t + s, 1.f) || t + s < 1.f)){\n"
+"        if (witness){\n"
+"            b3MprVec3Scale(&d1, s);\n"
+"            b3MprVec3Scale(&d2, t);\n"
+"            b3MprVec3Copy(witness, x0);\n"
+"            b3MprVec3Add(witness, &d1);\n"
+"            b3MprVec3Add(witness, &d2);\n"
+"            dist = b3MprVec3Dist2(witness, P);\n"
+"        }else{\n"
+"            dist  = s * s * v;\n"
+"            dist += t * t * w;\n"
+"            dist += 2.f * s * t * r;\n"
+"            dist += 2.f * s * p;\n"
+"            dist += 2.f * t * q;\n"
+"            dist += u;\n"
+"        }\n"
+"    }else{\n"
+"        dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);\n"
+"        dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);\n"
+"        if (dist2 < dist){\n"
+"            dist = dist2;\n"
+"            if (witness)\n"
+"                b3MprVec3Copy(witness, &witness2);\n"
+"        }\n"
+"        dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);\n"
+"        if (dist2 < dist){\n"
+"            dist = dist2;\n"
+"            if (witness)\n"
+"                b3MprVec3Copy(witness, &witness2);\n"
+"        }\n"
+"    }\n"
+"    return dist;\n"
+"}\n"
+"B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n"
+"													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"													b3ConstArray(b3Collidable_t)				cpuCollidables,\n"
+"													b3ConstArray(b3Float4)					cpuVertices,\n"
+"													__global b3Float4* sepAxis,\n"
+"                       b3MprSimplex_t *portal,\n"
+"                       float *depth, b3Float4 *pdir, b3Float4 *pos)\n"
+"{\n"
+"    b3Float4 dir;\n"
+"    b3MprSupport_t v4;\n"
+"    unsigned long iterations;\n"
+"	b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"    iterations = 1UL;\n"
+"	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n"
+"    //while (1)\n"
+"	{\n"
+"        // compute portal direction and obtain next support point\n"
+"        b3PortalDir(portal, &dir);\n"
+"        \n"
+"		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n"
+"        // reached tolerance -> find penetration info\n"
+"        if (portalReachTolerance(portal, &v4, &dir)\n"
+"                || iterations ==B3_MPR_MAX_ITERATIONS)\n"
+"		{\n"
+"            *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);\n"
+"            *depth = B3_MPR_SQRT(*depth);\n"
+"			\n"
+"			if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))\n"
+"			{\n"
+"				\n"
+"				*pdir = dir;\n"
+"			} \n"
+"			b3MprVec3Normalize(pdir);\n"
+"			\n"
+"            // barycentric coordinates:\n"
+"            b3FindPos(portal, pos);\n"
+"            return;\n"
+"        }\n"
+"        b3ExpandPortal(portal, &v4);\n"
+"        iterations++;\n"
+"    }\n"
+"}\n"
+"B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)\n"
+"{\n"
+"    // Touching contact on portal's v1 - so depth is zero and direction\n"
+"    // is unimportant and pos can be guessed\n"
+"    *depth = 0.f;\n"
+"    b3Float4 zero = b3MakeFloat4(0,0,0,0);\n"
+"	b3Float4* b3mpr_vec3_origin = &zero;\n"
+"	b3MprVec3Copy(dir, b3mpr_vec3_origin);\n"
+"    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n"
+"    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n"
+"    b3MprVec3Scale(pos, 0.5);\n"
+"}\n"
+"B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,\n"
+"                              float *depth, b3Float4 *dir, b3Float4 *pos)\n"
+"{\n"
+"    \n"
+"    // Origin lies on v0-v1 segment.\n"
+"    // Depth is distance to v1, direction also and position must be\n"
+"    // computed\n"
+"    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n"
+"    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n"
+"    b3MprVec3Scale(pos, 0.5f);\n"
+"    \n"
+"    b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);\n"
+"    *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));\n"
+"    b3MprVec3Normalize(dir);\n"
+"}\n"
+"inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,\n"
+"					b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,\n"
+"					b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n"
+"					b3ConstArray(b3Collidable_t)	cpuCollidables,\n"
+"					b3ConstArray(b3Float4)	cpuVertices,\n"
+"					__global b3Float4* sepAxis,\n"
+"					__global int*	hasSepAxis,\n"
+"					float *depthOut, b3Float4* dirOut, b3Float4* posOut)\n"
+"{\n"
+"	\n"
+"	 b3MprSimplex_t portal;\n"
+"	 \n"
+"//	if (!hasSepAxis[pairIndex])\n"
+"	//	return -1;\n"
+"	\n"
+"	hasSepAxis[pairIndex] = 0;\n"
+"	 int res;\n"
+"    // Phase 1: Portal discovery\n"
+"    res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);\n"
+"	\n"
+"	  \n"
+"	//sepAxis[pairIndex] = *pdir;//or -dir?\n"
+"	switch (res)\n"
+"	{\n"
+"	case 0:\n"
+"		{\n"
+"			// Phase 2: Portal refinement\n"
+"		\n"
+"			res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);\n"
+"			if (res < 0)\n"
+"				return -1;\n"
+"			// Phase 3. Penetration info\n"
+"			b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);\n"
+"			hasSepAxis[pairIndex] = 1;\n"
+"			sepAxis[pairIndex] = -*dirOut;\n"
+"			break;\n"
+"		}\n"
+"	case 1:\n"
+"		{\n"
+"			 // Touching contact on portal's v1.\n"
+"			b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);\n"
+"			break;\n"
+"		}\n"
+"	case 2:\n"
+"		{\n"
+"			\n"
+"			b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);\n"
+"			break;\n"
+"		}\n"
+"	default:\n"
+"		{\n"
+"			hasSepAxis[pairIndex]=0;\n"
+"			//if (res < 0)\n"
+"			//{\n"
+"				// Origin isn't inside portal - no collision.\n"
+"				return -1;\n"
+"			//}\n"
+"		}\n"
+"	};\n"
+"	\n"
+"	return 0;\n"
+"};\n"
+"#endif //B3_MPR_PENETRATION_H\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"	#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"	#define counter32_t volatile __global int*\n"
+"#endif\n"
+"__kernel void   mprPenetrationKernel( __global int4* pairs,\n"
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global float4* separatingNormals,\n"
+"																					__global int* hasSeparatingAxis,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		float depthOut;\n"
+"		b3Float4 dirOut;\n"
+"		b3Float4 posOut;\n"
+"		int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);\n"
+"		\n"
+"		\n"
+"		\n"
+"		\n"
+"		if (res==0)\n"
+"		{\n"
+"			//add a contact\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			if (dstIdx<contactCapacity)\n"
+"			{\n"
+"				pairs[pairIndex].z = dstIdx;\n"
+"				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n"
+"				c->m_worldNormalOnB = -dirOut;//normal;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				int bodyA = pairs[pairIndex].x;\n"
+"				int bodyB = pairs[pairIndex].y;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n"
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n"
+"				c->m_childIndexA = -1;\n"
+"				c->m_childIndexB = -1;\n"
+"				//for (int i=0;i<nContacts;i++)\n"
+"				posOut.w = -depthOut;\n"
+"				c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];\n"
+"				GET_NPOINTS(*c) = 1;//nContacts;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"typedef float4 Quaternion;\n"
+"#define make_float4 (float4)\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"}\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	return cross(a,b);\n"
+"}\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"}\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"}\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"}\n"
+"inline void project(__global const b3ConvexPolyhedronData_t* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"{\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"}\n"
+"bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices,\n"
+"	__global const float4* unitSphereDirections,\n"
+"	int numUnitSphereDirections,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test unit sphere directions\n"
+"	for (int i=0;i<numUnitSphereDirections;i++)\n"
+"	{\n"
+"		float4 crossje;\n"
+"		crossje = unitSphereDirections[i];	\n"
+"		if (dot3F4(DeltaC2,crossje)>0)\n"
+"			crossje *= -1.f;\n"
+"		{\n"
+"			float dist;\n"
+"			bool result = true;\n"
+"			float Min0,Max0;\n"
+"			float Min1,Max1;\n"
+"			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"		\n"
+"			if(Max0<Min1 || Max1<Min0)\n"
+"				return false;\n"
+"		\n"
+"			float d0 = Max0 - Min1;\n"
+"			float d1 = Max1 - Min0;\n"
+"			dist = d0<d1 ? d0:d1;\n"
+"			result = true;\n"
+"	\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"__kernel void   findSeparatingAxisUnitSphereKernel( __global const int4* pairs, \n"
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* unitSphereDirections,\n"
+"																					__global  float4* separatingNormals,\n"
+"																					__global  int* hasSeparatingAxis,\n"
+"																					__global  float* dmins,\n"
+"																					int numUnitSphereDirections,\n"
+"																					int numPairs\n"
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"	\n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"	\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"			\n"
+"			int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"	\n"
+"			float dmin = dmins[i];\n"
+"	\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			posA.w = 0.f;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			posB.w = 0.f;\n"
+"			float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"			float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"			float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"			const float4 DeltaC2 = c0 - c1;\n"
+"			float4 sepNormal = separatingNormals[i];\n"
+"			\n"
+"			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n"
+"			if (numEdgeEdgeDirections>numUnitSphereDirections)\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisUnitSphere(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																										posB,ornB,\n"
+"																										DeltaC2,\n"
+"																										vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 1;\n"
+"					separatingNormals[i] = sepNormal;\n"
+"				}\n"
+"			}\n"
+"		}		//if (hasSeparatingAxis[i])\n"
+"	}//(i<numPairs)\n"
+"}\n"
+;
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
new file mode 100644
index 0000000000..9c9e920f13
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
@@ -0,0 +1,1374 @@
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_PLANE 4
+#define SHAPE_CONCAVE_TRIMESH 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+#define SHAPE_SPHERE 7
+// SHAPE_* above: shape-type tags, presumably matched against btCollidableGpu::m_shapeType -- confirm with host code.
+// Extensions requested by this file: AMD printf plus the 32-bit local/global atomic operations.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+// Without the atomic-counter extension, counter32_t falls back to a plain volatile global int pointer.
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+// Shorthand for work-item / work-group queries, local-memory barriers and atomic operations.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+// max2 / min2 are plain aliases for the built-in max / min.
+#define max2 max
+#define min2 min
+
+typedef unsigned int u32;
+
+
+
+
+typedef struct // NOTE(review): name suggests an axis-aligned bounding box (min / max corners) -- confirm
+{
+	union // three views of the same 16 bytes: vector, float array, int array
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union // same three views for the max corner
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	int m_numChildShapes;
+	float m_radius;
+	int m_shapeType; // presumably one of the SHAPE_* tags defined at the top of this file -- confirm
+	int m_shapeIndex; // used in this file to index convexShapes[] (and faces[] for the plane shape)
+	
+} btCollidableGpu;
+
+typedef struct // NOTE(review): presumably one child of a compound shape; not referenced by the code visible here
+{
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0; // m_unused0..2 have no visible use; with them the members add up to 48 bytes
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w // the contact-point count is kept in the w lane of the contact normal
+
+typedef struct // per-body state as read by the functions in this file
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	// m_invMass == 0 is how this file recognises a body whose index is stored negated in a contact.
+	u32 m_collidableIdx;	
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  
+{
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	// m_faceOffset / m_numFaces select this hull's run of entries in the shared faces[] array.
+	float			m_radius;
+	int	m_faceOffset;
+	int m_numFaces;
+	int	m_numVertices;
+	// m_vertexOffset is the hull's first entry in the shared vertex array.
+	int m_vertexOffset;
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+
+} ConvexPolyhedronCL;
+
+typedef struct
+{
+	float4 m_plane; // xyz = face normal, w = plane constant
+	int m_indexOffset; // first entry of this face in the index array
+	int m_numIndices; // number of consecutive index entries (polygon vertices) of this face
+} btGpuFace;
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+// make_T(...) expands to the OpenCL vector literal (T)(...).
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+// Scalar division through native_divide (fast, implementation-defined precision).
+__inline float fastDiv(float numerator, float denominator)
+{
+	const float quotient = native_divide(numerator, denominator);
+	return quotient;
+}
+
+// Component-wise division of two float4 values through native_divide.
+__inline float4 fastDiv4(float4 numerator, float4 denominator)
+{
+	return native_divide(numerator, denominator);
+}
+
+
+// Thin wrapper over the built-in float4 cross().
+__inline float4 cross3(float4 a, float4 b)
+{
+	return cross(a, b);
+}
+
+//#define dot3F4 dot
+
+// Dot product over xyz only: both w lanes are zeroed so they never contribute.
+__inline float dot3F4(float4 a, float4 b)
+{
+	const float4 lhs = (float4)(a.xyz, 0.f);
+	const float4 rhs = (float4)(b.xyz, 0.f);
+	return dot(lhs, rhs);
+}
+
+// Normalises v over all four lanes using the built-in fast_normalize.
+__inline float4 fastNormalize4(float4 v)
+{
+	return fast_normalize(v);
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion;
+// Prototypes for the quaternion helpers defined below (qtRotate calls qtInvert, which is defined after it).
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b);
+
+__inline
+Quaternion qtNormalize(Quaternion in);
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec);
+
+__inline
+Quaternion qtInvert(Quaternion q);
+
+
+
+
+// Quaternion product a*b; quaternions are laid out as (x, y, z, w) with the scalar part in w.
+__inline Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// Vector part: a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz
+	Quaternion product = cross3( a, b );
+	product += a.w*b+b.w*a;
+	// Scalar part; overwrites whatever the vector update left in w.
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+}
+
+// Returns `in` rescaled to unit length.  Delegates to fastNormalize4 rather
+// than dividing by length(in).
+__inline Quaternion qtNormalize(Quaternion in)
+{
+	const Quaternion unit = fastNormalize4(in);
+	return unit;
+}
+// Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q); the incoming vec.w is ignored.
+__inline float4 qtRotate(Quaternion q, float4 vec)
+{
+	float4 pureVec = vec;
+	pureVec.w = 0.f;
+	const Quaternion conj = qtInvert( q );
+	const Quaternion halfRotated = qtMul(q,pureVec);
+	return qtMul(halfRotated,conj);
+}
+
+// Conjugate of q: vector part negated, scalar part kept (the inverse for unit quaternions).
+__inline Quaternion qtInvert(Quaternion q)
+{
+	return (Quaternion)(-q.x, -q.y, -q.z, q.w);
+}
+
+// Rotates vec by the conjugate of q, i.e. applies the opposite rotation.
+__inline float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	return qtRotate( qtInvert( q ), vec );
+}
+
+// Rotates *p by *orientation and then offsets the result by *translation.
+__inline float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	return (*translation) + qtRotate( *orientation, *p );
+}
+
+// Inverts a (translation, orientation) pair: the orientation is conjugated and the negated translation is rotated by it.
+void	trInverse(float4 translationIn, Quaternion orientationIn, float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion invOrientation = qtInvert(orientationIn);
+	*orientationOut = invOrientation;
+	*translationOut = qtRotate(invOrientation, -translationIn);
+}
+
+// Composes transform A with transform B: orientation = qA*qB, translation = B's translation mapped through A.
+void	trMul(float4 translationA, Quaternion orientationA, float4 translationB, Quaternion orientationB,
+		float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion composedOrn = qtMul(orientationA,orientationB);
+	*orientationOut = composedOrn;
+	*translationOut = transform(&translationB,&translationA,&orientationA);
+}
+
+
+
+// Normalises the xyz part of a; w is cleared first so it cannot affect the length.
+__inline float4 normalize3(const float4 a)
+{
+	const float4 xyzOnly = (float4)(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( xyzOnly );
+}
+
+
+// Linear interpolation from a (t = 0) towards b (t = 1) on the xyz lanes.
+// The w lane of the result is always 0.
+__inline float4 lerp3(const float4 a,const float4 b, float  t)
+{
+	float4 blended = a + (b - a) * t;
+	blended.w = 0.f;
+	return blended;
+}
+
+
+// Signed distance of point from the plane (xyz = normal, w = constant); also stores point moved back along the normal by that distance.
+float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)
+{
+	const float signedDist = dot3F4((float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0), point) + planeEqn.w;
+	*closestPointOnFace = point - signedDist * (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);
+	return signedDist;
+}
+
+
+
+// Tests whether p lies inside the polygon described by `face`.  The polygon
+// is walked edge by edge, with each edge normal built as edge x face-normal.
+// Returns true when p is on the inner side of every edge.  On the first edge
+// that has p on its outer side the function returns false and stores in *out
+// the point of that edge (clamped to its end points) that is closest to p.
+// *out is NOT written when the face has fewer than two indices or on success.
+inline bool IsPointInPolygon(float4 p, 
+							const btGpuFace* face,
+							__global const float4* baseVertex,
+							__global const  int* convexIndices,
+							float4* out)
+{
+	if (face->m_numIndices<2)
+		return false;
+
+	const float4 faceNormal = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);
+	const int numIndices = face->m_numIndices;
+
+	// Start from the last vertex so that the first edge closes the loop.
+	float4 b = baseVertex[convexIndices[face->m_indexOffset + numIndices-1]];
+
+	for (int i = 0; i < numIndices; ++i)
+	{
+		const float4 a = b;
+		b = baseVertex[convexIndices[face->m_indexOffset + i]];
+		const float4 ab = b-a;
+		const float4 ap = p-a;
+		const float4 edgeNormal = cross3(ab,faceNormal);
+
+		if (dot(ap, edgeNormal) > 0.f)
+		{
+			const float ab_m2 = dot(ab, ab);
+			const float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;
+			if (rt <= 0.f)
+			{
+				*out = a;
+			}
+			else if (rt >= 1.f)
+			{
+				*out = b;
+			}
+			else
+			{
+				// Interpolate all four lanes so that out->w is always defined.
+				// Writing only x/y/z would leave w at whatever the caller's
+				// variable held, and callers take 4-component dot products
+				// of values derived from *out.
+				const float s = 1.f - rt;
+				*out = s * a + rt * b;
+			}
+			return false;
+		}
+	}
+	return true;
+}
+
+
+
+
+// Sphere-vs-convex-hull contact generation for a single pair.
+//   spherePos2, radius : sphere centre and radius; the centre is mapped through the
+//                        inverse of (pos, quat) before any test is made
+//   pos, quat          : transform of the hull; presumably body B's, since the hull
+//                        shape is looked up through collidableIndexB
+// Each face plane of the hull is tested against the sphere centre.  When the sphere
+// touches or penetrates the hull, one contact is appended to globalContactsOut, its
+// slot being reserved by atomically incrementing nGlobalContactsOut.
+// A contact whose reserved slot is >= maxContactCapacity is dropped.
+void	computeContactSphereConvex(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																__global const BodyData* rigidBodies, 
+																__global const btCollidableGpu* collidables,
+																__global const ConvexPolyhedronCL* convexShapes,
+																__global const float4* convexVertices,
+																__global const int* convexIndices,
+																__global const btGpuFace* faces,
+																__global struct b3Contact4Data* restrict globalContactsOut,
+																counter32_t nGlobalContactsOut,
+																int maxContactCapacity,
+																float4 spherePos2,
+																float radius,
+																float4 pos,
+																float4 quat
+																)
+{
+	// Bring the sphere centre into the hull's local frame.
+	float4 invPos;
+	float4 invOrn;
+	trInverse(pos,quat, &invPos,&invOrn);
+	const float4 spherePos = transform(&spherePos2,&invPos,&invOrn);
+
+	const int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	const int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = (float4)(0, 0, 0, 0);
+	float4 hitNormalWorld = (float4)(0, 0, 0, 0);
+	// minDist keeps the largest signed distance seen; the sentinel marks "no face yet".
+	float minDist = -1000000.f;
+	bool bCollide = true;
+
+	for ( int f = 0; f < numFaces; f++ )
+	{
+		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];
+
+		// Plane equation of the face: xyz = normal, w = plane constant.
+		float4 n1 = face.m_plane;
+		n1.w = 0.f;
+		float4 planeEqn = n1;
+		planeEqn.w = face.m_plane.w;
+
+		// Signed distance from the sphere centre to the face plane.
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);
+
+		// More than one radius outside this plane: the plane separates the two shapes.
+		if ( dist > radius )
+		{
+			bCollide = false;
+			break;
+		}
+
+		if (dist>0)
+		{
+			// Centre is outside this face: the sphere might hit an edge or vertex.
+			float4 out;
+			bool isInPoly = IsPointInPolygon(spherePos,
+					&face,
+					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+					convexIndices,
+					&out);
+			if (isInPoly)
+			{
+				if (dist>minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					hitNormalWorld = planeEqn;
+				}
+			} else
+			{
+				// Closest feature is the edge point reported through `out`.
+				float4 tmp = spherePos-out;
+				float l2 = dot(tmp,tmp);
+				if (l2<radius*radius)
+				{
+					dist  = sqrt(l2);
+					if (dist>minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						hitNormalWorld = tmp/dist;
+					}
+				} else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		} else
+		{
+			// Centre is on or behind this face plane.
+			if ( dist > minDist )
+			{
+				minDist = dist;
+				closestPnt = pntReturn;
+				hitNormalWorld.xyz = planeEqn.xyz;
+			}
+		}
+	}
+
+	// Map the best candidate back through (pos, quat).
+	if (bCollide && minDist > -10000)
+	{
+		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);
+		float4 pOnB1 = transform(&closestPnt,&pos,&quat);
+
+		float actualDepth = minDist-radius;
+		if (actualDepth<=0.f)
+		{
+			// The contact depth travels in the w lane of the contact position.
+			pOnB1.w = actualDepth;
+
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );
+
+			// The counter is incremented unconditionally, so the reserved index can run
+			// past the end of the output buffer; drop the contact instead of writing out
+			// of bounds.  NOTE(review): the host is expected to clamp the final counter.
+			if (dstIdx < maxContactCapacity)
+			{
+				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+				c->m_worldNormalOnB = -normalOnSurfaceB1;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+				c->m_batchIdx = pairIndex;
+				// A body with zero inverse mass is flagged by storing its index negated.
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+				c->m_worldPosB[0] = pOnB1;
+				c->m_childIndexA = -1;
+				c->m_childIndexB = -1;
+
+				GET_NPOINTS(*c) = 1;
+			}
+		}
+	}
+}
+							
+
+
+// Reduces a set of contact points to at most four representative ones.
+//   p          : candidate points; w is treated as depth (smallest w = deepest)
+//   nPoints    : number of candidates; only the first 64 are examined
+//   nearNormal : contact normal from which two in-plane sampling axes are built
+//   contactIdx : receives the four chosen indices (written only if nPoints > 4)
+// The picks are the extreme points along +/-u and +/-v about the centroid; the
+// deepest point replaces lane x when none of the four picks is already it.
+// Returns nPoints itself when it is <= 4, otherwise 4.
+int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)
+{
+	if( nPoints == 0 )
+		return 0;
+
+	if (nPoints <=4)
+		return nPoints;
+
+	if (nPoints >64)
+		nPoints = 64;
+
+	// Centroid of the points that will be examined.
+	float4 center = make_float4(0.f);
+	for (int i=0;i<nPoints;i++)
+		center += p[i];
+	center /= (float)nPoints;
+
+	// Sampling axes: u and v are both perpendicular to nearNormal.
+	float4 aVector = p[0] - center;
+	float4 u = cross3( nearNormal, aVector );
+	float4 v = cross3( nearNormal, u );
+	u = normalize3( u );
+	v = normalize3( v );
+
+	// The deepest point (smallest w) is tracked separately so it is never dropped.
+	float minW= FLT_MAX;
+	int minIndex=-1;
+
+	// Running minima of the projections onto +u, -u, +v and -v.  They must
+	// start at FLT_MAX: FLT_MIN is the smallest *positive* float, and using it
+	// as the sentinel leaves a contactIdx lane unwritten whenever every
+	// projection on that axis is >= FLT_MIN.
+	float4 minDots = (float4)(FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX);
+
+	for(int ie = 0; ie<nPoints; ie++ )
+	{
+		if (p[ie].w<minW)
+		{
+			minW = p[ie].w;
+			minIndex=ie;
+		}
+
+		const float4 r = p[ie]-center;
+		float f;
+
+		f = dot3F4( u, r );
+		if (f<minDots.x)
+		{
+			minDots.x = f;
+			contactIdx[0].x = ie;
+		}
+
+		f = dot3F4( -u, r );
+		if (f<minDots.y)
+		{
+			minDots.y = f;
+			contactIdx[0].y = ie;
+		}
+
+		f = dot3F4( v, r );
+		if (f<minDots.z)
+		{
+			minDots.z = f;
+			contactIdx[0].z = ie;
+		}
+
+		f = dot3F4( -v, r );
+		if (f<minDots.w)
+		{
+			minDots.w = f;
+			contactIdx[0].w = ie;
+		}
+	}
+
+	// Make sure the deepest point is part of the result (skipped if none was found).
+	if (minIndex >= 0
+		&& contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+	{
+		//replace the first contact with minimum (todo: replace contact with least penetration)
+		contactIdx[0].x = minIndex;
+	}
+
+	return 4;
+}
+
+// Upper bound on the penetrating hull vertices gathered per plane/convex pair.
+#define MAX_PLANE_CONVEX_POINTS 64
+
+// Plane (body A) versus convex hull (body B) contact generation.
+//
+// The plane equation is read from faces[] through collidable A's shape index; the
+// hull vertices come from convexVertices through collidable B's shape index.
+// posB/ornB are passed in explicitly (instead of being read from rigidBodies) so
+// callers can supply the transform of the hull themselves.
+//
+// Every hull vertex lying behind the plane becomes a candidate contact (at most
+// MAX_PLANE_CONVEX_POINTS); more than four candidates are reduced to four with
+// extractManifoldSequential. One b3Contact4Data entry is then appended to
+// globalContactsOut.
+//
+// Returns the index of the written contact, or -1 when nothing was written
+// (no penetrating vertex, or the reserved slot is >= maxContactCapacity).
+// Note that the counter is incremented even when the capacity is exceeded.
+int computeContactPlaneConvex(int pairIndex,
+								int bodyIndexA, int bodyIndexB, 
+								int collidableIndexA, int collidableIndexB, 
+								__global const BodyData* rigidBodies, 
+								__global const btCollidableGpu*collidables,
+								__global const ConvexPolyhedronCL* convexShapes,
+								__global const float4* convexVertices,
+								__global const int* convexIndices,
+								__global const btGpuFace* faces,
+								__global struct b3Contact4Data* restrict globalContactsOut,
+								counter32_t nGlobalContactsOut,
+								int maxContactCapacity,
+								float4 posB,
+								Quaternion ornB
+								)
+{
+	int resultIndex=-1;
+
+	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;
+	__global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];
+
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+
+	// Plane equation in the plane body's local frame: xyz = normal, w = constant.
+	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);
+	float4 planeNormalWorld = qtRotate(ornA,planeNormal);
+	float planeConstant = planeEq.w;
+
+	// Inverse of the plane transform, used to bring world vertices into the plane frame.
+	float4 invPosA;Quaternion invOrnA;
+	trInverse(posA,ornA,&invPosA,&invOrnA);
+
+	// Plane frame expressed in the hull frame; only the orientation is needed below.
+	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;
+	{
+		float4 invPosB;Quaternion invOrnB;
+		trInverse(posB,ornB,&invPosB,&invOrnB);
+		trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1);	
+	}
+
+	// Negated plane normal in the hull frame: the vertex with the largest dot
+	// product against it is the one furthest behind the plane.
+	float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);
+	float maxDot = -1e30f;
+
+	float4 contactPoints[MAX_PLANE_CONVEX_POINTS];
+	int numPoints = 0;
+
+	// Default selection, used as-is when there are four points or fewer.
+	int4 contactIdx;
+	contactIdx=make_int4(0,1,2,3);
+
+	for (int i=0;i<hullB->m_numVertices;i++)
+	{
+		float4 vtx = convexVertices[hullB->m_vertexOffset+i];
+		float curDot = dot(vtx,planeNormalInConvex);
+
+		if (curDot>maxDot)
+		{
+			maxDot=curDot;
+			//make sure the deepest points is always included:
+			//free the last slot when the buffer is already full
+			if (numPoints==MAX_PLANE_CONVEX_POINTS)
+				numPoints--;
+		}
+
+		if (numPoints<MAX_PLANE_CONVEX_POINTS)
+		{
+			float4 vtxWorld = transform(&vtx, &posB, &ornB);
+			float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;
+			float dist = dot(planeNormal,vtxInPlane)-planeConstant;
+			if (dist<0.f)
+			{
+				// Store the world position with the signed distance in w.
+				vtxWorld.w = dist;
+				contactPoints[numPoints] = vtxWorld;
+				numPoints++;
+			}
+		}
+	}
+
+	int numReducedPoints  = numPoints;
+	if (numPoints>4)
+	{
+		// NOTE(review): contactPoints hold world-space positions while the normal
+		// handed over here is expressed in the hull frame - confirm that this is
+		// the intended sampling normal (planeNormalWorld would match the points).
+		numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);
+	}
+
+	if (numReducedPoints>0)
+	{
+		int dstIdx;
+		AppendInc( nGlobalContactsOut, dstIdx );
+
+		if (dstIdx < maxContactCapacity)
+		{
+			resultIndex = dstIdx;
+			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+			c->m_worldNormalOnB = -planeNormalWorld;
+			//c->setFrictionCoeff(0.7);
+			//c->setRestituitionCoeff(0.f);
+			c->m_restituitionCoeffCmp = (0.f*0xffff);
+			c->m_frictionCoeffCmp = (0.7f*0xffff);
+			c->m_batchIdx = pairIndex;
+			// A zero inverse mass is flagged by storing the negated body index.
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_childIndexA = -1;
+			c->m_childIndexB = -1;
+
+			// Intentional fall-through: N points fill slots N-1 down to 0.
+			switch (numReducedPoints)
+			{
+				case 4:
+					c->m_worldPosB[3] = contactPoints[contactIdx.w];
+					// fall through
+				case 3:
+					c->m_worldPosB[2] = contactPoints[contactIdx.z];
+					// fall through
+				case 2:
+					c->m_worldPosB[1] = contactPoints[contactIdx.y];
+					// fall through
+				case 1:
+					c->m_worldPosB[0] = contactPoints[contactIdx.x];
+					// fall through
+				default:
+				{
+				}
+			};
+
+			GET_NPOINTS(*c) = numReducedPoints;
+		}//if (dstIdx < maxContactCapacity)
+	}	
+
+	return resultIndex;
+}
+
+
+// Plane (body A) versus sphere (body B): appends at most one contact.
+//
+// The plane equation is read from faces[] through collidable A's shape index and
+// the sphere radius from collidable B. Nothing is written when the tested sphere
+// point is not behind the plane, or when the slot reserved through
+// nGlobalContactsOut lies at or beyond maxContactCapacity (the counter is still
+// incremented in that case).
+void	computeContactPlaneSphere(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																__global const BodyData* rigidBodies, 
+																__global const btCollidableGpu* collidables,
+																__global const btGpuFace* faces,
+																__global struct b3Contact4Data* restrict globalContactsOut,
+																counter32_t nGlobalContactsOut,
+																int maxContactCapacity)
+{
+	float4 planeBodyPos = rigidBodies[bodyIndexA].m_pos;
+	float4 planeBodyOrn = rigidBodies[bodyIndexA].m_quat;
+	float4 sphereBodyPos = rigidBodies[bodyIndexB].m_pos;
+	float4 sphereBodyOrn = rigidBodies[bodyIndexB].m_quat;
+	float sphereRadius = collidables[collidableIndexB].m_radius;
+
+	// Plane equation in the plane body's frame: xyz = normal, w = constant.
+	float4 planeEquation = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	float4 localNormal = make_float4(planeEquation.x,planeEquation.y,planeEquation.z,0.f);
+	float planeOffset = planeEquation.w;
+
+	// Sphere frame expressed relative to the plane frame.
+	float4 sphereInPlanePos; Quaternion sphereInPlaneOrn;
+	{
+		float4 invPlanePos;Quaternion invPlaneOrn;
+		trInverse(planeBodyPos,planeBodyOrn,&invPlanePos,&invPlaneOrn);
+		trMul(invPlanePos,invPlaneOrn,sphereBodyPos,sphereBodyOrn,&sphereInPlanePos,&sphereInPlaneOrn);
+	}
+
+	// Plane frame expressed relative to the sphere frame (only the rotation is used).
+	float4 planeInSpherePos;	Quaternion planeInSphereOrn;
+	{
+		float4 invSpherePos;Quaternion invSphereOrn;
+		trInverse(sphereBodyPos,sphereBodyOrn,&invSpherePos,&invSphereOrn);
+		trMul(invSpherePos,invSphereOrn,planeBodyPos,planeBodyOrn,&planeInSpherePos,&planeInSphereOrn);	
+	}
+
+	// Sphere surface point along the negated plane normal, moved into the plane frame.
+	float4 surfacePoint = qtRotate(planeInSphereOrn,-localNormal)*sphereRadius;
+	float4 surfacePointInPlane = transform(&surfacePoint,&sphereInPlanePos,&sphereInPlaneOrn);
+	float separation = dot3F4(localNormal,surfacePointInPlane) - planeOffset;
+
+	// No contact unless that point is strictly behind the plane.
+	if (!(separation < 0.f))
+		return;
+
+	// Project onto the plane, go back to world space, then offset along the
+	// world normal by the signed distance; w carries that distance.
+	float4 projectedInPlane = surfacePointInPlane -   separation*localNormal;
+	float4 projectedWorld = transform(&projectedInPlane,&planeBodyPos,&planeBodyOrn);
+	float4 worldNormal = qtRotate(planeBodyOrn,localNormal);
+	float4 contactOnB = projectedWorld+worldNormal*separation;
+	contactOnB.w = separation;
+
+	int dstIdx;
+	AppendInc( nGlobalContactsOut, dstIdx );
+	if (dstIdx >= maxContactCapacity)
+		return;
+
+	__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+	c->m_worldNormalOnB = -worldNormal;
+	c->m_restituitionCoeffCmp = (0.f*0xffff);
+	c->m_frictionCoeffCmp = (0.7f*0xffff);
+	c->m_batchIdx = pairIndex;
+	// A zero inverse mass is flagged by storing the negated body index.
+	c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+	c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+	c->m_worldPosB[0] = contactOnB;
+	c->m_childIndexA = -1;
+	c->m_childIndexB = -1;
+	GET_NPOINTS(*c) = 1;
+}
+
+
+// Narrowphase kernel for primitive shape pairs; one work-item per entry of 'pairs'.
+//
+// pairs[i].x / pairs[i].y hold the two body indices. Depending on the shape
+// types of their collidables the pair is dispatched to:
+//   plane  / convex hull -> computeContactPlaneConvex  (the plane is passed as A)
+//   plane  / sphere      -> computeContactPlaneSphere  (the plane is passed as A)
+//   sphere / convex hull -> computeContactSphereConvex (the sphere is passed as A)
+//   sphere / sphere      -> handled inline below
+// Any other combination is ignored. Only the plane/convex path stores the index
+// of the generated contact back into pairs[i].z.
+//
+// uniqueEdges is part of the kernel signature but is not read here.
+__kernel void   primitiveContactsKernel( __global int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numPairs, int maxContactCapacity)
+{
+	int i = get_global_id(0);
+	if (i>=numPairs)
+		return;
+
+	int pairIndex = i;
+
+	int bodyIndexA = pairs[i].x;
+	int bodyIndexB = pairs[i].y;
+
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+
+	if (shapeTypeA == SHAPE_PLANE && shapeTypeB == SHAPE_CONVEX_HULL)
+	{
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+
+		int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+															rigidBodies,collidables,convexShapes,vertices,indices,
+															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);
+		if (contactIndex>=0)
+			pairs[pairIndex].z = contactIndex;
+
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_CONVEX_HULL && shapeTypeB == SHAPE_PLANE)
+	{
+		// Same test with the roles swapped so that the plane is passed as body A.
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+
+		int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+															rigidBodies,collidables,convexShapes,vertices,indices,
+															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);
+		if (contactIndex>=0)
+			pairs[pairIndex].z = contactIndex;
+
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_PLANE && shapeTypeB == SHAPE_SPHERE)
+	{
+		computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+															rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_PLANE)
+	{
+		computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+															rigidBodies,collidables,
+															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_CONVEX_HULL)
+	{
+		float4 spherePos = rigidBodies[bodyIndexA].m_pos;
+		float sphereRadius = collidables[collidableIndexA].m_radius;
+		float4 convexPos = rigidBodies[bodyIndexB].m_pos;
+		float4 convexOrn = rigidBodies[bodyIndexB].m_quat;
+
+		computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+															rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+															spherePos,sphereRadius,convexPos,convexOrn);
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_CONVEX_HULL && shapeTypeB == SHAPE_SPHERE)
+	{
+		float4 spherePos = rigidBodies[bodyIndexB].m_pos;
+		float sphereRadius = collidables[collidableIndexB].m_radius;
+		float4 convexPos = rigidBodies[bodyIndexA].m_pos;
+		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;
+
+		computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, 
+															rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+															spherePos,sphereRadius,convexPos,convexOrn);
+		return;
+	}
+
+	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_SPHERE)
+	{
+		//sphere-sphere
+		float radiusA = collidables[collidableIndexA].m_radius;
+		float radiusB = collidables[collidableIndexB].m_radius;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+		float4 diff = posA-posB;
+		float len = length(diff);
+
+		///iff distance positive, don't generate a new contact
+		if ( len <= (radiusA+radiusB))
+		{
+			///distance (negative means penetration)
+			float dist = len - (radiusA+radiusB);
+
+			// Fall back to a fixed axis when the centres (almost) coincide.
+			float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);
+			if (len > 0.00001f)
+			{
+				normalOnSurfaceB = diff / len;
+			}
+			float4 contactPosB = posB + normalOnSurfaceB*radiusB;
+			contactPosB.w = dist;
+
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );
+
+			if (dstIdx < maxContactCapacity)
+			{
+				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+				c->m_worldNormalOnB = normalOnSurfaceB;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);
+				c->m_frictionCoeffCmp = (0.7f*0xffff);
+				c->m_batchIdx = pairIndex;
+				// A zero inverse mass is flagged by storing the negated body index.
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+				c->m_worldPosB[0] = contactPosB;
+				c->m_childIndexA = -1;
+				c->m_childIndexB = -1;
+				GET_NPOINTS(*c) = 1;
+			}//if (dstIdx < maxContactCapacity)
+		}//if ( len <= (radiusA+radiusB))
+
+		return;
+	}//SHAPE_SPHERE SHAPE_SPHERE
+}
+
+
+// work-in-progress
+// Narrowphase kernel for compound child pairs; one work-item per entry of
+// gpuCompoundPairs.
+//
+// gpuCompoundPairs[i] = (bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB).
+// A child index >= 0 selects an entry of gpuChildShapes: its collidable is used
+// and its child position/orientation are composed with the body transform.
+// A negative child index means the body's own collidable and transform are used.
+//
+// Handled combinations: plane/convex hull and sphere/convex hull (both orders);
+// everything else is ignored. uniqueEdges and aabbs are not read here.
+__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,
+													__global const BodyData* rigidBodies, 
+													__global const btCollidableGpu* collidables,
+													__global const ConvexPolyhedronCL* convexShapes, 
+													__global const float4* vertices,
+													__global const float4* uniqueEdges,
+													__global const btGpuFace* faces,
+													__global const int* indices,
+													__global btAabbCL* aabbs,
+													__global const btGpuChildShape* gpuChildShapes,
+													__global struct b3Contact4Data* restrict globalContactsOut,
+													counter32_t nGlobalContactsOut,
+													int numCompoundPairs, int maxContactCapacity
+													)
+{
+	int i = get_global_id(0);
+	if (i>=numCompoundPairs)
+		return;
+
+	int bodyIndexA = gpuCompoundPairs[i].x;
+	int bodyIndexB = gpuCompoundPairs[i].y;
+
+	int childShapeIndexA = gpuCompoundPairs[i].z;
+	int childShapeIndexB = gpuCompoundPairs[i].w;
+
+	int collidableIndexA = -1;
+	int collidableIndexB = -1;
+
+	// Start from the body transforms; they are replaced by the composed
+	// body*child transform when a child shape is referenced.
+	float4 ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+
+	float4 ornB = rigidBodies[bodyIndexB].m_quat;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+	if (childShapeIndexA >= 0)
+	{
+		collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+		float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+		float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+		float4 newPosA = qtRotate(ornA,childPosA)+posA;
+		float4 newOrnA = qtMul(ornA,childOrnA);
+		posA = newPosA;
+		ornA = newOrnA;
+	} else
+	{
+		collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	}
+
+	if (childShapeIndexB>=0)
+	{
+		collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+		float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+		float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+		float4 newPosB = transform(&childPosB,&posB,&ornB);
+		float4 newOrnB = qtMul(ornB,childOrnB);
+		posB = newPosB;
+		ornB = newOrnB;
+	} else
+	{
+		collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+	}
+
+	int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+
+	int pairIndex = i;
+
+	if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))
+	{
+		computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB,  collidableIndexA,collidableIndexB, 
+															rigidBodies,collidables,convexShapes,vertices,indices,
+															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);
+		return;
+	}
+
+	if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))
+	{
+		// Roles swapped so that the plane is passed as body A.
+		computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+															rigidBodies,collidables,convexShapes,vertices,indices,
+															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);
+		return;
+	}
+
+	if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))
+	{
+		// Use the composed position so a child transform on the sphere side is
+		// honoured; it equals the body position when no child shape is referenced.
+		float4 spherePos = posB;
+		float sphereRadius = collidables[collidableIndexB].m_radius;
+		float4 convexPos = posA;
+		float4 convexOrn = ornA;
+
+		computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, 
+									rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+									spherePos,sphereRadius,convexPos,convexOrn);
+		return;
+	}
+
+	if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))
+	{
+		// Composed position, see the convex/sphere case above.
+		float4 spherePos = posA;
+		float sphereRadius = collidables[collidableIndexA].m_radius;
+		float4 convexPos = posB;
+		float4 convexOrn = ornB;
+
+		computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+									rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+									spherePos,sphereRadius,convexPos,convexOrn);
+		return;
+	}
+}
+
+
+// Tests on which side of the three triangle edges the point *p lies.
+//
+// vertices : the three triangle corners (vertices[0..2]).
+// normal   : triangle normal; each edge is crossed with it to get a per-edge
+//            direction.
+// p        : the point to classify (only read).
+// Returns true when the three signed edge tests are either all strictly
+// positive or all less than or equal to zero, false for any mixed result.
+bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )
+{
+	const float4 a = vertices[0];
+	const float4 b = vertices[1];
+	const float4 c = vertices[2];
+	const float4 pt = *p;
+	const float4 n = *normal;
+
+	// Signed test of the point against each edge (a->b, b->c, c->a).
+	const float sideAB = dot(cross(b - a, n), pt - a);
+	const float sideBC = dot(cross(c - b, n), pt - b);
+	const float sideCA = dot(cross(a - c, n), pt - c);
+
+	const bool allPositive = sideAB > 0 && sideBC > 0 && sideCA > 0;
+	const bool allNonPositive = sideAB <= 0 && sideBC <= 0 && sideCA <= 0;
+
+	return allPositive || allNonPositive;
+}
+
+
+// Squared distance from point p to the segment [from, to].
+//
+// The closest point on the segment is written to *nearest. The projection
+// parameter is clamped to [0, 1], so the end points are returned when p
+// projects outside the segment.
+float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) 
+{
+	float4 offset = p - from;
+	float4 segment = to - from;
+	float param = dot(segment,offset);
+
+	if (!(param > 0))
+	{
+		// Projects before the start of the segment.
+		param = 0;
+	}
+	else
+	{
+		float segmentLenSqr = dot(segment,segment);
+		if (param < segmentLenSqr)
+		{
+			// Projects inside the segment.
+			param /= segmentLenSqr;
+			offset -= param*segment;
+		}
+		else
+		{
+			// Projects beyond the end of the segment.
+			param = 1;
+			offset -= segment;
+		}
+	}
+
+	*nearest = from + param*segment;
+	return dot(offset,offset);	
+}
+
+
+// Sphere versus single triangle: appends at most one contact.
+//
+// triangleVertices : the three triangle corners, in the frame given by pos/quat.
+// spherePos2       : sphere centre; it is moved into the triangle frame with the
+//                    inverse of (pos, quat).
+// radius           : sphere radius.
+// pos, quat        : transform of the body owning the triangle; used to bring
+//                    the contact point and normal back out of the triangle frame.
+// faceIndex        : stored in m_childIndexB of the generated contact.
+//
+// The contact point is the point of the triangle closest to the sphere centre:
+// the projection onto the triangle plane when pointInTriangle accepts the
+// centre, otherwise the nearest point among the three edges that lie within
+// the sphere radius. Nothing is written when the sphere does not reach the
+// triangle, when the centre coincides with the contact point, or when the
+// reserved slot is >= maxContactCapacity.
+void	computeContactSphereTriangle(int pairIndex,
+									int bodyIndexA, int bodyIndexB,
+									int collidableIndexA, int collidableIndexB, 
+									__global const BodyData* rigidBodies, 
+									__global const btCollidableGpu* collidables,
+									const float4* triangleVertices,
+									__global struct b3Contact4Data* restrict globalContactsOut,
+									counter32_t nGlobalContactsOut,
+									int maxContactCapacity,
+									float4 spherePos2,
+									float radius,
+									float4 pos,
+									float4 quat,
+									int faceIndex
+									)
+{
+	// Sphere centre in the triangle's frame.
+	float4 invPos;
+	float4 invOrn;
+	trInverse(pos,quat, &invPos,&invOrn);
+	float4 sphereCenter = transform(&spherePos2,&invPos,&invOrn);
+
+	const float4* vertices = triangleVertices;
+	float contactBreakingThreshold = 0.f;//todo?
+	float radiusWithThreshold = radius + contactBreakingThreshold;
+
+	// Triangle plane normal.
+	float4 edge10 = vertices[1]-vertices[0];
+	edge10.w = 0.f;//is this needed?
+	float4 edge20 = vertices[2]-vertices[0];
+	edge20.w = 0.f;//is this needed?
+	float4 normal = normalize(cross3(edge10,edge20));
+
+	float4 p1ToCenter = sphereCenter - vertices[0];
+	float distanceFromPlane = dot(p1ToCenter,normal);
+	if (distanceFromPlane < 0.f)
+	{
+		//triangle facing the other way
+		distanceFromPlane *= -1.f;
+		normal *= -1.f;
+	}
+
+	// The sphere cannot touch the triangle if it does not reach its plane.
+	if (!(distanceFromPlane < radiusWithThreshold))
+		return;
+
+	// Check for contact / intersection
+	bool hasContact = false;
+	float4 contactPoint;
+	if (pointInTriangle(vertices,&normal, &sphereCenter)) 
+	{
+		// Inside the contact wedge - touches a point on the shell plane
+		hasContact = true;
+		contactPoint = sphereCenter - normal*distanceFromPlane;
+	} else {
+		// Could be inside one of the contact capsules: keep the nearest edge
+		// point, not merely the last edge that happens to be within range.
+		float nearestSqr = radiusWithThreshold*radiusWithThreshold;
+		for (int e = 0; e < 3; e++) 
+		{
+			float4 nearestOnEdge;
+			float distanceSqr = segmentSqrDistance(vertices[e],vertices[(e+1)%3],sphereCenter, &nearestOnEdge);
+			if (distanceSqr < nearestSqr) 
+			{
+				nearestSqr = distanceSqr;
+				hasContact = true;
+				contactPoint = nearestOnEdge;
+			}
+		}
+	}
+
+	if (!hasContact)
+		return;
+
+	// Contact normal points from the contact point towards the sphere centre;
+	// it is undefined when both coincide, so that case is dropped.
+	float4 contactToCenter = sphereCenter - contactPoint;
+	float minDist = length(contactToCenter);
+	if (!(minDist>FLT_EPSILON))
+		return;
+	float4 hitNormal = normalize(contactToCenter);//*(1./minDist);
+
+	float actualDepth = minDist-radius;
+	if (!(actualDepth<=0.f))
+		return;
+
+	// Back out of the triangle frame; w of the position carries the depth.
+	float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormal);
+	float4 pOnB1 = transform(&contactPoint,&pos,&quat);
+	pOnB1.w = actualDepth;
+
+	float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);
+	if (!(lenSqr>FLT_EPSILON))
+		return;
+
+	int dstIdx;
+	AppendInc( nGlobalContactsOut, dstIdx );
+
+	if (dstIdx < maxContactCapacity)
+	{
+		__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+		c->m_worldNormalOnB = -normalOnSurfaceB1;
+		c->m_restituitionCoeffCmp = (0.f*0xffff);
+		c->m_frictionCoeffCmp = (0.7f*0xffff);
+		c->m_batchIdx = pairIndex;
+		// A zero inverse mass is flagged by storing the negated body index.
+		c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+		c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+		c->m_worldPosB[0] = pOnB1;
+
+		c->m_childIndexA = -1;
+		c->m_childIndexB = faceIndex;
+
+		GET_NPOINTS(*c) = 1;
+	} 
+}
+
+
+
+// work-in-progress
+// Sphere versus concave-mesh triangle kernel; one work-item per entry of
+// concavePairs.
+//
+// concavePairs[i] = (bodyIndexA, bodyIndexB, faceIndex, ...): body A supplies the
+// triangle (face 'faceIndex' of its convexShapes entry), body B is tested only
+// when its collidable is a sphere; other shape types are ignored.
+// uniqueEdges and aabbs are not read here.
+__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,
+												__global const BodyData* rigidBodies,
+												__global const btCollidableGpu* collidables,
+												__global const ConvexPolyhedronCL* convexShapes, 
+												__global const float4* vertices,
+												__global const float4* uniqueEdges,
+												__global const btGpuFace* faces,
+												__global const int* indices,
+												__global btAabbCL* aabbs,
+												__global struct b3Contact4Data* restrict globalContactsOut,
+												counter32_t nGlobalContactsOut,
+													int numConcavePairs, int maxContactCapacity
+												)
+{
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_SPHERE)
+		return;
+
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+
+	// Gather the three corners of the referenced face.
+	int f = concavePairs[i].z;
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+
+	float4 verticesA[3];
+	for (int corner=0;corner<3;corner++)
+	{
+		int index = indices[face.m_indexOffset+corner];
+		verticesA[corner] = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+	}
+
+	float4 spherePos = rigidBodies[bodyIndexB].m_pos;
+	float sphereRadius = collidables[collidableIndexB].m_radius;
+	float4 convexPos = rigidBodies[bodyIndexA].m_pos;
+	float4 convexOrn = rigidBodies[bodyIndexA].m_quat;
+
+	// The sphere is passed as body A of the contact, the mesh as body B.
+	computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, 
+															rigidBodies,collidables,
+															verticesA,
+															globalContactsOut, nGlobalContactsOut,maxContactCapacity,
+															spherePos,sphereRadius,convexPos,convexOrn, f);
+}
+\ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
new file mode 100644
index 0000000000..b0103fe674
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
@@ -0,0 +1,1289 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* primitiveContactsKernelsCL= \
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"{\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_PLANE 4\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"#define SHAPE_SPHERE 7\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"typedef unsigned int u32;\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	int m_numChildShapes;\n"
+"	float m_radius;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;	\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct  \n"
+"{\n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"// Scalar division through the native_divide built-in.\n"
+"__inline\n"
+"float fastDiv(float numerator, float denominator)\n"
+"{\n"
+"	const float quotient = native_divide(numerator, denominator);\n"
+"	return quotient;\n"
+"}\n"
+"// Component-wise float4 division through the native_divide built-in.\n"
+"__inline\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"{\n"
+"	const float4 quotient = native_divide(numerator, denominator);\n"
+"	return quotient;\n"
+"}\n"
+"// Cross product of two float4 values, forwarded to the cross built-in.\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	const float4 product = cross(a, b);\n"
+"	return product;\n"
+"}\n"
+"//#define dot3F4 dot\n"
+"// Dot product of the xyz parts of a and b; the w components are replaced by zero first.\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	return dot((float4)(a.xyz, 0.f), (float4)(b.xyz, 0.f));\n"
+"}\n"
+"// Normalizes all four components of v with the fast_normalize built-in.\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	const float4 unit = fast_normalize(v);\n"
+"	return unit;\n"
+"}\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"typedef float4 Quaternion;\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"// Quaternion product a*b, with xyz holding the vector part and w the scalar part.\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	// vector part: a x b + (a.w*b + b.w*a)\n"
+"	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);\n"
+"	// scalar part overwrites whatever the vector expression left in w\n"
+"	product.w = a.w*b.w - dot3F4(a, b);\n"
+"	return product;\n"
+"}\n"
+"// Returns the quaternion scaled to unit length (via fastNormalize4).\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	const Quaternion unit = fastNormalize4(in);\n"
+"	return unit;\n"
+"}\n"
+"// Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q).\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	// treat the input as a pure quaternion: its w component is ignored\n"
+"	float4 pureVec = vec;\n"
+"	pureVec.w = 0.f;\n"
+"	return qtMul(qtMul(q,pureVec),qtInvert( q ));\n"
+"}\n"
+"// Returns q with its xyz part negated and w unchanged (the conjugate).\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	Quaternion conjugate = q;\n"
+"	conjugate.xyz = -q.xyz;\n"
+"	return conjugate;\n"
+"}\n"
+"// Rotates vec by the inverse of q.\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	const Quaternion qInv = qtInvert( q );\n"
+"	return qtRotate( qInv, vec );\n"
+"}\n"
+"// Applies a rigid transform to *p: rotate by *orientation, then add *translation.\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	const float4 rotated = qtRotate( *orientation, *p );\n"
+"	return rotated + (*translation);\n"
+"}\n"
+"// Inverts the rigid transform (translationIn, orientationIn) and stores the result in the two outputs.\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	const Quaternion invOrn = qtInvert(orientationIn);\n"
+"	*orientationOut = invOrn;\n"
+"	// inverse translation is the negated translation rotated by the inverse orientation\n"
+"	*translationOut = qtRotate(invOrn, -translationIn);\n"
+"}\n"
+"// Composes two rigid transforms: the output maps a point through B first and then through A.\n"
+"void	trMul(float4 translationA, Quaternion orientationA,\n"
+"						float4 translationB, Quaternion orientationB,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	const Quaternion combinedOrn = qtMul(orientationA,orientationB);\n"
+"	const float4 combinedPos = transform(&translationB,&translationA,&orientationA);\n"
+"	*orientationOut = combinedOrn;\n"
+"	*translationOut = combinedPos;\n"
+"}\n"
+"// Normalizes the xyz part of a; the w component is zeroed before normalizing.\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	float4 xyzOnly = a;\n"
+"	xyzOnly.w = 0.f;\n"
+"	return fastNormalize4( xyzOnly );\n"
+"}\n"
+"// Linear interpolation between the xyz parts of a and b by factor t; the result has w = 0.\n"
+"__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
+"{\n"
+"	float4 blended;\n"
+"	blended.x = a.x + (b.x - a.x) * t;\n"
+"	blended.y = a.y + (b.y - a.y) * t;\n"
+"	blended.z = a.z + (b.z - a.z) * t;\n"
+"	blended.w = 0.f;\n"
+"	return blended;\n"
+"}\n"
+"// Returns dot(planeEqn.xyz, point.xyz) + planeEqn.w and writes the projection of point\n"
+"// onto the plane to *closestPointOnFace.\n"
+"float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n"
+"{\n"
+"	float4 normal = planeEqn;\n"
+"	normal.w = 0.f;\n"
+"	const float signedDist = dot3F4(normal, point) + planeEqn.w;\n"
+"	*closestPointOnFace = point - signedDist * normal;\n"
+"	return signedDist;\n"
+"}\n"
+"// Tests p against every edge of the polygon described by face.\n"
+"// Returns true when p is on the inner side of all edges. Otherwise returns false and stores in\n"
+"// *out the point closest to p on the first edge that p was found to be outside of.\n"
+"// baseVertex points at the first vertex of the owning hull; convexIndices is the shared index array.\n"
+"// NOTE: when the face has fewer than two indices the function returns false without writing *out.\n"
+"inline bool IsPointInPolygon(float4 p, \n"
+"							const btGpuFace* face,\n"
+"							__global const float4* baseVertex,\n"
+"							__global const  int* convexIndices,\n"
+"							float4* out)\n"
+"{\n"
+"	const int numIndices = face->m_numIndices;\n"
+"	if (numIndices<2)\n"
+"		return false;\n"
+"	\n"
+"	const int firstIndex = face->m_indexOffset;\n"
+"	float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n"
+"	\n"
+"	// start with the last vertex so that the first edge closes the polygon\n"
+"	float4 b = baseVertex[convexIndices[firstIndex + numIndices-1]];\n"
+"	for (int i=0; i<numIndices; ++i)\n"
+"	{\n"
+"		float4 a = b;\n"
+"		b = baseVertex[convexIndices[firstIndex + i]];\n"
+"		float4 ab = b-a;\n"
+"		float4 ap = p-a;\n"
+"		// in-plane direction perpendicular to the edge\n"
+"		float4 v = cross3(ab,plane);\n"
+"		if (dot(ap, v) > 0.f)\n"
+"		{\n"
+"			// p is outside this edge: clamp its projection onto the segment [a,b]\n"
+"			float ab_m2 = dot(ab, ab);\n"
+"			float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;\n"
+"			if (rt <= 0.f)\n"
+"			{\n"
+"				*out = a;\n"
+"			}\n"
+"			else if (rt >= 1.f) \n"
+"			{\n"
+"				*out = b;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// interpolate all four components so that out->w is never left uninitialized\n"
+"				float s = 1.f - rt;\n"
+"				*out = s * a + rt * b;\n"
+"			}\n"
+"			return false;\n"
+"		}\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// Sphere versus convex hull test. The sphere belongs to bodyIndexA, the hull to bodyIndexB\n"
+"// (collidableIndexB selects the hull). spherePos2 is the sphere centre and pos/quat the hull pose,\n"
+"// both in world space. When the shapes touch, one contact slot is reserved through\n"
+"// nGlobalContactsOut and filled in, provided the slot index is below maxContactCapacity.\n"
+"// The contact stores the penetration depth in m_worldPosB[0].w.\n"
+"void	computeContactSphereConvex(int pairIndex,\n"
+"																int bodyIndexA, int bodyIndexB, \n"
+"																int collidableIndexA, int collidableIndexB, \n"
+"																__global const BodyData* rigidBodies, \n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const ConvexPolyhedronCL* convexShapes,\n"
+"																__global const float4* convexVertices,\n"
+"																__global const int* convexIndices,\n"
+"																__global const btGpuFace* faces,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																counter32_t nGlobalContactsOut,\n"
+"																int maxContactCapacity,\n"
+"																float4 spherePos2,\n"
+"																float radius,\n"
+"																float4 pos,\n"
+"																float4 quat\n"
+"																)\n"
+"{\n"
+"	// express the sphere centre in the local frame of the hull\n"
+"	float4 invPos;\n"
+"	float4 invOrn;\n"
+"	trInverse(pos,quat, &invPos,&invOrn);\n"
+"	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n"
+"	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
+"	int numFaces = convexShapes[shapeIndex].m_numFaces;\n"
+"	float4 closestPnt = (float4)(0, 0, 0, 0);\n"
+"	float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n"
+"	float minDist = -1000000.f;\n"
+"	bool bCollide = true;\n"
+"	for ( int f = 0; f < numFaces; f++ )\n"
+"	{\n"
+"		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
+"		// plane equation of the face: xyz normal, w constant\n"
+"		float4 planeEqn;\n"
+"		float4 n1 = face.m_plane;\n"
+"		n1.w = 0.f;\n"
+"		planeEqn = n1;\n"
+"		planeEqn.w = face.m_plane.w;\n"
+"		\n"
+"		// signed distance from the sphere centre to the face plane\n"
+"		float4 pntReturn;\n"
+"		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n"
+"		// farther than the radius in front of a face: that face plane separates the shapes\n"
+"		if ( dist > radius )\n"
+"		{\n"
+"			bCollide = false;\n"
+"			break;\n"
+"		}\n"
+"		if (dist>0)\n"
+"		{\n"
+"			//might hit an edge or vertex\n"
+"			float4 out;\n"
+"			bool isInPoly = IsPointInPolygon(spherePos,\n"
+"					&face,\n"
+"					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n"
+"					convexIndices,\n"
+"					&out);\n"
+"			if (isInPoly)\n"
+"			{\n"
+"				if (dist>minDist)\n"
+"				{\n"
+"					minDist = dist;\n"
+"					closestPnt = pntReturn;\n"
+"					hitNormalWorld = planeEqn;\n"
+"				}\n"
+"			} else\n"
+"			{\n"
+"				// the closest feature is an edge or vertex: measure against that point instead\n"
+"				float4 tmp = spherePos-out;\n"
+"				float l2 = dot(tmp,tmp);\n"
+"				if (l2<radius*radius)\n"
+"				{\n"
+"					dist  = sqrt(l2);\n"
+"					if (dist>minDist)\n"
+"					{\n"
+"						minDist = dist;\n"
+"						closestPnt = out;\n"
+"						hitNormalWorld = tmp/dist;\n"
+"					}\n"
+"				} else\n"
+"				{\n"
+"					bCollide = false;\n"
+"					break;\n"
+"				}\n"
+"			}\n"
+"		} else\n"
+"		{\n"
+"			// centre is behind this face: keep the face with the largest (least negative) distance\n"
+"			if ( dist > minDist )\n"
+"			{\n"
+"				minDist = dist;\n"
+"				closestPnt = pntReturn;\n"
+"				hitNormalWorld.xyz = planeEqn.xyz;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if (bCollide && minDist > -10000)\n"
+"	{\n"
+"		// bring the normal and the contact point back to world space\n"
+"		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n"
+"		float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n"
+"		\n"
+"		float actualDepth = minDist-radius;\n"
+"		if (actualDepth<=0.f)\n"
+"		{\n"
+"			pOnB1.w = actualDepth;\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			\n"
+"			// never write past the end of the contact buffer\n"
+"			if (dstIdx < maxContactCapacity)\n"
+"			{\n"
+"				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"				c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				// bodies with zero inverse mass are stored with a negated index\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"				c->m_worldPosB[0] = pOnB1;\n"
+"				c->m_childIndexA = -1;\n"
+"				c->m_childIndexB = -1;\n"
+"				// the point count lives in the w component of the normal\n"
+"				GET_NPOINTS(*c) = 1;\n"
+"			}//if (dstIdx < maxContactCapacity)\n"
+"		}\n"
+"	}//if (hasCollision)\n"
+"}\n"
+"							\n"
+"// Reduces a set of contact points to at most four.\n"
+"// p holds nPoints candidates whose w component is compared to find the smallest value.\n"
+"// Returns 0 for an empty set and nPoints when there are four or fewer points; in both cases\n"
+"// *contactIdx is not touched. Otherwise at most the first 64 points are considered, the chosen\n"
+"// indices are written to *contactIdx and 4 is returned.\n"
+"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n"
+"{\n"
+"	if( nPoints == 0 )\n"
+"        return 0;\n"
+"    \n"
+"    if (nPoints <=4)\n"
+"        return nPoints;\n"
+"    \n"
+"    \n"
+"    if (nPoints >64)\n"
+"        nPoints = 64;\n"
+"    \n"
+"	// average of all considered points (all four components are averaged)\n"
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			center += p[i];\n"
+"		center /= (float)nPoints;\n"
+"	}\n"
+"    \n"
+"	\n"
+"    \n"
+"	//	sample 4 directions\n"
+"    \n"
+"    // u and v are both perpendicular to nearNormal; u is also perpendicular to p[0]-center\n"
+"    float4 aVector = p[0] - center;\n"
+"    float4 u = cross3( nearNormal, aVector );\n"
+"    float4 v = cross3( nearNormal, u );\n"
+"    u = normalize3( u );\n"
+"    v = normalize3( v );\n"
+"    \n"
+"    \n"
+"    //keep point with deepest penetration\n"
+"    float minW= FLT_MAX;\n"
+"    \n"
+"    int minIndex=-1;\n"
+"    \n"
+"    // despite the name, these track the smallest projection seen so far along u, -u, v and -v.\n"
+"    // A component of *contactIdx is only written when a projection is below the FLT_MIN start value.\n"
+"    float4 maxDots;\n"
+"    maxDots.x = FLT_MIN;\n"
+"    maxDots.y = FLT_MIN;\n"
+"    maxDots.z = FLT_MIN;\n"
+"    maxDots.w = FLT_MIN;\n"
+"    \n"
+"    //	idx, distance\n"
+"    for(int ie = 0; ie<nPoints; ie++ )\n"
+"    {\n"
+"        if (p[ie].w<minW)\n"
+"        {\n"
+"            minW = p[ie].w;\n"
+"            minIndex=ie;\n"
+"        }\n"
+"        float f;\n"
+"        float4 r = p[ie]-center;\n"
+"        f = dot3F4( u, r );\n"
+"        if (f<maxDots.x)\n"
+"        {\n"
+"            maxDots.x = f;\n"
+"            contactIdx[0].x = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -u, r );\n"
+"        if (f<maxDots.y)\n"
+"        {\n"
+"            maxDots.y = f;\n"
+"            contactIdx[0].y = ie;\n"
+"        }\n"
+"        \n"
+"        \n"
+"        f = dot3F4( v, r );\n"
+"        if (f<maxDots.z)\n"
+"        {\n"
+"            maxDots.z = f;\n"
+"            contactIdx[0].z = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -v, r );\n"
+"        if (f<maxDots.w)\n"
+"        {\n"
+"            maxDots.w = f;\n"
+"            contactIdx[0].w = ie;\n"
+"        }\n"
+"        \n"
+"    }\n"
+"    \n"
+"    // make sure the point with the smallest w is part of the result\n"
+"    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n"
+"    {\n"
+"        //replace the first contact with minimum (todo: replace contact with least penetration)\n"
+"        contactIdx[0].x = minIndex;\n"
+"    }\n"
+"    \n"
+"    return 4;\n"
+"    \n"
+"}\n"
+"#define MAX_PLANE_CONVEX_POINTS 64\n"
+"// Plane (body A) versus convex hull (body B) test.\n"
+"// The plane equation is read from faces[collidables[collidableIndexA].m_shapeIndex] and the pose of\n"
+"// A from rigidBodies; posB/ornB give the pose of the hull. Hull vertices with a negative distance to\n"
+"// the plane are collected (at most MAX_PLANE_CONVEX_POINTS), reduced to at most four and written as\n"
+"// one contact. Returns the index of the written contact, or -1 when nothing was written.\n"
+"int computeContactPlaneConvex(int pairIndex,\n"
+"								int bodyIndexA, int bodyIndexB, \n"
+"								int collidableIndexA, int collidableIndexB, \n"
+"								__global const BodyData* rigidBodies, \n"
+"								__global const btCollidableGpu*collidables,\n"
+"								__global const ConvexPolyhedronCL* convexShapes,\n"
+"								__global const float4* convexVertices,\n"
+"								__global const int* convexIndices,\n"
+"								__global const btGpuFace* faces,\n"
+"								__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"								counter32_t nGlobalContactsOut,\n"
+"								int maxContactCapacity,\n"
+"								float4 posB,\n"
+"								Quaternion ornB\n"
+"								)\n"
+"{\n"
+"	int resultIndex=-1;\n"
+"		int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
+"	__global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n"
+"	\n"
+"	float4 posA;\n"
+"	posA = rigidBodies[bodyIndexA].m_pos;\n"
+"	Quaternion ornA;\n"
+"	ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	float4 planeEq;\n"
+"	 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n"
+"	float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
+"	float4 planeNormalWorld;\n"
+"	planeNormalWorld = qtRotate(ornA,planeNormal);\n"
+"	float planeConstant = planeEq.w;\n"
+"	\n"
+"	// pose of the hull relative to the plane body\n"
+"	float4 invPosA;Quaternion invOrnA;\n"
+"	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n"
+"	{\n"
+"		\n"
+"		trInverse(posA,ornA,&invPosA,&invOrnA);\n"
+"		trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	}\n"
+"	// pose of the plane body relative to the hull\n"
+"	float4 invPosB;Quaternion invOrnB;\n"
+"	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;\n"
+"	{\n"
+"		\n"
+"		trInverse(posB,ornB,&invPosB,&invOrnB);\n"
+"		trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1);	\n"
+"	}\n"
+"	\n"
+"	// negated plane normal expressed in the local frame of the hull\n"
+"	float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n"
+"	float maxDot = -1e30;\n"
+"	int hitVertex=-1;\n"
+"	float4 hitVtx;\n"
+"	float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n"
+"	int numPoints = 0;\n"
+"	int4 contactIdx;\n"
+"	contactIdx=make_int4(0,1,2,3);\n"
+"    \n"
+"	\n"
+"	for (int i=0;i<hullB->m_numVertices;i++)\n"
+"	{\n"
+"		float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n"
+"		float curDot = dot(vtx,planeNormalInConvex);\n"
+"		if (curDot>maxDot)\n"
+"		{\n"
+"			hitVertex=i;\n"
+"			maxDot=curDot;\n"
+"			hitVtx = vtx;\n"
+"			//make sure the deepest points is always included\n"
+"			// (frees the last slot when the buffer is already full)\n"
+"			if (numPoints==MAX_PLANE_CONVEX_POINTS)\n"
+"				numPoints--;\n"
+"		}\n"
+"		if (numPoints<MAX_PLANE_CONVEX_POINTS)\n"
+"		{\n"
+"			float4 vtxWorld = transform(&vtx, &posB, &ornB);\n"
+"			float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;\n"
+"			float dist = dot(planeNormal,vtxInPlane)-planeConstant;\n"
+"			if (dist<0.f)\n"
+"			{\n"
+"				// store the world-space vertex with its signed distance in w\n"
+"				vtxWorld.w = dist;\n"
+"				contactPoints[numPoints] = vtxWorld;\n"
+"				numPoints++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	int numReducedPoints  = numPoints;\n"
+"	if (numPoints>4)\n"
+"	{\n"
+"		numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n"
+"	}\n"
+"	if (numReducedPoints>0)\n"
+"	{\n"
+"		// reserve one output slot; it is only written when it fits in the buffer\n"
+"		int dstIdx;\n"
+"	    AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		if (dstIdx < maxContactCapacity)\n"
+"		{\n"
+"			resultIndex = dstIdx;\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			c->m_worldNormalOnB = -planeNormalWorld;\n"
+"			//c->setFrictionCoeff(0.7);\n"
+"			//c->setRestituitionCoeff(0.f);\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = pairIndex;\n"
+"			// bodies with zero inverse mass are stored with a negated index\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			// the cases fall through on purpose: N points fill slots N-1 down to 0\n"
+"			switch (numReducedPoints)\n"
+"            {\n"
+"                case 4:\n"
+"                    c->m_worldPosB[3] = contactPoints[contactIdx.w];\n"
+"                case 3:\n"
+"                    c->m_worldPosB[2] = contactPoints[contactIdx.z];\n"
+"                case 2:\n"
+"                    c->m_worldPosB[1] = contactPoints[contactIdx.y];\n"
+"                case 1:\n"
+"                    c->m_worldPosB[0] = contactPoints[contactIdx.x];\n"
+"                default:\n"
+"                {\n"
+"                }\n"
+"            };\n"
+"			\n"
+"			// the point count lives in the w component of the normal\n"
+"			GET_NPOINTS(*c) = numReducedPoints;\n"
+"		}//if (dstIdx < numPairs)\n"
+"	}	\n"
+"	return resultIndex;\n"
+"}\n"
+"// Plane (body A) versus sphere (body B) test.\n"
+"// The plane equation comes from faces[collidables[collidableIndexA].m_shapeIndex], the radius from\n"
+"// collidables[collidableIndexB] and both poses from rigidBodies. When the sphere point nearest to\n"
+"// the plane has a negative distance, one contact slot is reserved and, if it is below\n"
+"// maxContactCapacity, filled with a single point whose w component is that distance.\n"
+"void	computeContactPlaneSphere(int pairIndex,\n"
+"																int bodyIndexA, int bodyIndexB, \n"
+"																int collidableIndexA, int collidableIndexB, \n"
+"																__global const BodyData* rigidBodies, \n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const btGpuFace* faces,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																counter32_t nGlobalContactsOut,\n"
+"																int maxContactCapacity)\n"
+"{\n"
+"	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n"
+"	float radius = collidables[collidableIndexB].m_radius;\n"
+"	float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n"
+"	float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n"
+"	float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n"
+"	float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n"
+"	\n"
+"	bool hasCollision = false;\n"
+"	float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
+"	float planeConstant = planeEq.w;\n"
+"	// pose of the sphere relative to the plane body\n"
+"	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n"
+"	{\n"
+"		float4 invPosA;Quaternion invOrnA;\n"
+"		trInverse(posA1,ornA1,&invPosA,&invOrnA);\n"
+"		trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	}\n"
+"	// pose of the plane body relative to the sphere\n"
+"	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;\n"
+"	{\n"
+"		float4 invPosB;Quaternion invOrnB;\n"
+"		trInverse(posB1,ornB1,&invPosB,&invOrnB);\n"
+"		trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1);	\n"
+"	}\n"
+"	// sphere surface point along the negated plane normal, first in sphere space then in plane space\n"
+"	float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n"
+"	float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n"
+"	hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n"
+"	if (hasCollision)\n"
+"	{\n"
+"		float4 vtxInPlaneProjected1 = vtxInPlane1 -   distance*planeNormal1;\n"
+"		float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n"
+"		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n"
+"		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n"
+"		pOnB1.w = distance;\n"
+"		// reserve one output slot; it is only written when it fits in the buffer\n"
+"		int dstIdx;\n"
+"    AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		\n"
+"		if (dstIdx < maxContactCapacity)\n"
+"		{\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = pairIndex;\n"
+"			// bodies with zero inverse mass are stored with a negated index\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"			c->m_worldPosB[0] = pOnB1;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			// the point count lives in the w component of the normal\n"
+"			GET_NPOINTS(*c) = 1;\n"
+"		}//if (dstIdx < numPairs)\n"
+"	}//if (hasCollision)\n"
+"}\n"
+"// One work-item per entry of pairs (x and y hold the two body indices).\n"
+"// Dispatches on the two shape types and handles plane/convex, plane/sphere, sphere/convex and\n"
+"// sphere/sphere in either order; other combinations are ignored. Contacts are appended to\n"
+"// globalContactsOut through the nGlobalContactsOut counter. For plane/convex pairs the index of\n"
+"// the written contact is stored in pairs[i].z.\n"
+"__kernel void   primitiveContactsKernel( __global int4* pairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs, int maxContactCapacity)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	// surplus work-items have nothing to do\n"
+"	if (i>=numPairs)\n"
+"		return;\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	int bodyIndexA = pairs[i].x;\n"
+"	int bodyIndexB = pairs[i].y;\n"
+"	\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"	int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
+"	int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"	\n"
+"	if (shapeTypeA == SHAPE_PLANE && shapeTypeB == SHAPE_CONVEX_HULL)\n"
+"	{\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		Quaternion ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"															rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);\n"
+"		if (contactIndex>=0)\n"
+"			pairs[pairIndex].z = contactIndex;\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_CONVEX_HULL && shapeTypeB == SHAPE_PLANE)\n"
+"	{\n"
+"		// same test with the roles of A and B swapped so that the plane comes first\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		Quaternion ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
+"															rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
+"		if (contactIndex>=0)\n"
+"			pairs[pairIndex].z = contactIndex;\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_PLANE && shapeTypeB == SHAPE_SPHERE)\n"
+"	{\n"
+"		computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"															rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_PLANE)\n"
+"	{\n"
+"		computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
+"															rigidBodies,collidables,\n"
+"															faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_CONVEX_HULL)\n"
+"	{\n"
+"		float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n"
+"		float sphereRadius = collidables[collidableIndexA].m_radius;\n"
+"		float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n"
+"		float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n"
+"		computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"															rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"															spherePos,sphereRadius,convexPos,convexOrn);\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_CONVEX_HULL && shapeTypeB == SHAPE_SPHERE)\n"
+"	{\n"
+"		float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
+"		float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"		float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
+"		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
+"		computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n"
+"															rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"															spherePos,sphereRadius,convexPos,convexOrn);\n"
+"		return;\n"
+"	}\n"
+"	if (shapeTypeA == SHAPE_SPHERE && shapeTypeB == SHAPE_SPHERE)\n"
+"	{\n"
+"		//sphere-sphere\n"
+"		float radiusA = collidables[collidableIndexA].m_radius;\n"
+"		float radiusB = collidables[collidableIndexB].m_radius;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		float4 diff = posA-posB;\n"
+"		float len = length(diff);\n"
+"		\n"
+"		///iff distance positive, don't generate a new contact\n"
+"		if ( len <= (radiusA+radiusB))\n"
+"		{\n"
+"			///distance (negative means penetration)\n"
+"			float dist = len - (radiusA+radiusB);\n"
+"			// fall back to the x axis when the centres (nearly) coincide\n"
+"			float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n"
+"			if (len > 0.00001f)\n"
+"			{\n"
+"				normalOnSurfaceB = diff / len;\n"
+"			}\n"
+"			float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n"
+"			contactPosB.w = dist;\n"
+"			\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			\n"
+"			if (dstIdx < maxContactCapacity)\n"
+"			{\n"
+"				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"				c->m_worldNormalOnB = normalOnSurfaceB;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				// bodies with zero inverse mass are stored with a negated index\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"				c->m_worldPosB[0] = contactPosB;\n"
+"				c->m_childIndexA = -1;\n"
+"				c->m_childIndexB = -1;\n"
+"				GET_NPOINTS(*c) = 1;\n"
+"			}//if (dstIdx < maxContactCapacity)\n"
+"		}//if ( len <= (radiusA+radiusB))\n"
+"		return;\n"
+"	}//SHAPE_SPHERE SHAPE_SPHERE\n"
+"}\n"
+"// work-in-progress\n"
+"// One work-item per entry of gpuCompoundPairs: x/y are the body indices and z/w the child shape\n"
+"// indices (negative when that side is not a compound child). The pose of each side is the body\n"
+"// pose, composed with the child pose when a child index is given. Handles plane/convex and\n"
+"// sphere/convex in either order; other combinations are ignored.\n"
+"__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n"
+"													__global const BodyData* rigidBodies, \n"
+"													__global const btCollidableGpu* collidables,\n"
+"													__global const ConvexPolyhedronCL* convexShapes, \n"
+"													__global const float4* vertices,\n"
+"													__global const float4* uniqueEdges,\n"
+"													__global const btGpuFace* faces,\n"
+"													__global const int* indices,\n"
+"													__global btAabbCL* aabbs,\n"
+"													__global const btGpuChildShape* gpuChildShapes,\n"
+"													__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"													counter32_t nGlobalContactsOut,\n"
+"													int numCompoundPairs, int maxContactCapacity\n"
+"													)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"		int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
+"		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"		\n"
+"		int collidableIndexA = -1;\n"
+"		int collidableIndexB = -1;\n"
+"		\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		\n"
+"		float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		\n"
+"		if (childShapeIndexA >= 0)\n"
+"		{\n"
+"			// side A is a compound child: compose the child pose with the body pose\n"
+"			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"			float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"			float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"			posA = newPosA;\n"
+"			ornA = newOrnA;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		}\n"
+"		\n"
+"		if (childShapeIndexB>=0)\n"
+"		{\n"
+"			// side B is a compound child: compose the child pose with the body pose\n"
+"			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"		}\n"
+"	\n"
+"		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		int pairIndex = i;\n"
+"		if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB,  collidableIndexA,collidableIndexB, \n"
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n"
+"		{\n"
+"			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
+"																rigidBodies,collidables,convexShapes,vertices,indices,\n"
+"																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n"
+"		{\n"
+"			// posB already includes the child offset when the sphere is a compound child\n"
+"			float4 spherePos = posB;\n"
+"			float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"			float4 convexPos = posA;\n"
+"			float4 convexOrn = ornA;\n"
+"			\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, \n"
+"										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"										spherePos,sphereRadius,convexPos,convexOrn);\n"
+"	\n"
+"			return;\n"
+"		}\n"
+"		if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			// posA already includes the child offset when the sphere is a compound child\n"
+"			float4 spherePos = posA;\n"
+"			float sphereRadius = collidables[collidableIndexA].m_radius;\n"
+"			float4 convexPos = posB;\n"
+"			float4 convexOrn = ornB;\n"
+"			\n"
+"			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"										spherePos,sphereRadius,convexPos,convexOrn);\n"
+"	\n"
+"			return;\n"
+"		}\n"
+"	}//	if (i<numCompoundPairs)\n"
+"}\n"
+"// Edge test of *p against the triangle vertices[0..2] with the given normal.\n"
+"// Returns true when the three edge tests are either all strictly positive or all non-positive.\n"
+"bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n"
+"{\n"
+"	const float4 v0 = vertices[0];\n"
+"	const float4 v1 = vertices[1];\n"
+"	const float4 v2 = vertices[2];\n"
+"	const float4 n = *normal;\n"
+"	const float4 pt = *p;\n"
+"	\n"
+"	// per edge: dot of (edge x normal) with the vector from the edge start to the point\n"
+"	const float r1 = dot(cross(v1 - v0, n), pt - v0);\n"
+"	const float r2 = dot(cross(v2 - v1, n), pt - v1);\n"
+"	const float r3 = dot(cross(v0 - v2, n), pt - v2);\n"
+"	\n"
+"	const bool allPositive = ( r1 > 0 && r2 > 0 && r3 > 0 );\n"
+"	const bool allNonPositive = ( r1 <= 0 && r2 <= 0 && r3 <= 0 );\n"
+"	return allPositive || allNonPositive;\n"
+"}\n"
+"float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n"
+"{\n"
+"	float4 diff = p - from;\n"
+"	float4 v = to - from;\n"
+"	float t = dot(v,diff);\n"
+"	\n"
+"	if (t > 0) \n"
+"	{\n"
+"		float dotVV = dot(v,v);\n"
+"		if (t < dotVV) \n"
+"		{\n"
+"			t /= dotVV;\n"
+"			diff -= t*v;\n"
+"		} else \n"
+"		{\n"
+"			t = 1;\n"
+"			diff -= v;\n"
+"		}\n"
+"	} else\n"
+"	{\n"
+"		t = 0;\n"
+"	}\n"
+"	*nearest = from + t*v;\n"
+"	return dot(diff,diff);	\n"
+"}\n"
+"void	computeContactSphereTriangle(int pairIndex,\n"
+"									int bodyIndexA, int bodyIndexB,\n"
+"									int collidableIndexA, int collidableIndexB, \n"
+"									__global const BodyData* rigidBodies, \n"
+"									__global const btCollidableGpu* collidables,\n"
+"									const float4* triangleVertices,\n"
+"									__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"									counter32_t nGlobalContactsOut,\n"
+"									int maxContactCapacity,\n"
+"									float4 spherePos2,\n"
+"									float radius,\n"
+"									float4 pos,\n"
+"									float4 quat,\n"
+"									int faceIndex\n"
+"									)\n"
+"{\n"
+"	float4 invPos;\n"
+"	float4 invOrn;\n"
+"	trInverse(pos,quat, &invPos,&invOrn);\n"
+"	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n"
+"	int numFaces = 3;\n"
+"	float4 closestPnt = (float4)(0, 0, 0, 0);\n"
+"	float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n"
+"	float minDist = -1000000.f;\n"
+"	bool bCollide = false;\n"
+"	\n"
+"	//////////////////////////////////////\n"
+"	float4 sphereCenter;\n"
+"	sphereCenter = spherePos;\n"
+"	const float4* vertices = triangleVertices;\n"
+"	float contactBreakingThreshold = 0.f;//todo?\n"
+"	float radiusWithThreshold = radius + contactBreakingThreshold;\n"
+"	float4 edge10;\n"
+"	edge10 = vertices[1]-vertices[0];\n"
+"	edge10.w = 0.f;//is this needed?\n"
+"	float4 edge20;\n"
+"	edge20 = vertices[2]-vertices[0];\n"
+"	edge20.w = 0.f;//is this needed?\n"
+"	float4 normal = cross3(edge10,edge20);\n"
+"	normal = normalize(normal);\n"
+"	float4 p1ToCenter;\n"
+"	p1ToCenter = sphereCenter - vertices[0];\n"
+"	\n"
+"	float distanceFromPlane = dot(p1ToCenter,normal);\n"
+"	if (distanceFromPlane < 0.f)\n"
+"	{\n"
+"		//triangle facing the other way\n"
+"		distanceFromPlane *= -1.f;\n"
+"		normal *= -1.f;\n"
+"	}\n"
+"	hitNormalWorld = normal;\n"
+"	bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n"
+"	\n"
+"	// Check for contact / intersection\n"
+"	bool hasContact = false;\n"
+"	float4 contactPoint;\n"
+"	if (isInsideContactPlane) \n"
+"	{\n"
+"	\n"
+"		if (pointInTriangle(vertices,&normal, &sphereCenter)) \n"
+"		{\n"
+"			// Inside the contact wedge - touches a point on the shell plane\n"
+"			hasContact = true;\n"
+"			contactPoint = sphereCenter - normal*distanceFromPlane;\n"
+"			\n"
+"		} else {\n"
+"			// Could be inside one of the contact capsules\n"
+"			float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;\n"
+"			float4 nearestOnEdge;\n"
+"			int numEdges = 3;\n"
+"			for (int i = 0; i < numEdges; i++) \n"
+"			{\n"
+"				float4 pa =vertices[i];\n"
+"				float4 pb = vertices[(i+1)%3];\n"
+"				float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n"
+"				if (distanceSqr < contactCapsuleRadiusSqr) \n"
+"				{\n"
+"					// Yep, we're inside a capsule\n"
+"					hasContact = true;\n"
+"					contactPoint = nearestOnEdge;\n"
+"					\n"
+"				}\n"
+"				\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (hasContact) \n"
+"	{\n"
+"		closestPnt = contactPoint;\n"
+"		float4 contactToCenter = sphereCenter - contactPoint;\n"
+"		minDist = length(contactToCenter);\n"
+"		if (minDist>FLT_EPSILON)\n"
+"		{\n"
+"			hitNormalWorld = normalize(contactToCenter);//*(1./minDist);\n"
+"			bCollide  = true;\n"
+"		}\n"
+"		\n"
+"	}\n"
+"	/////////////////////////////////////\n"
+"	if (bCollide && minDist > -10000)\n"
+"	{\n"
+"		\n"
+"		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n"
+"		float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n"
+"		float actualDepth = minDist-radius;\n"
+"		\n"
+"		if (actualDepth<=0.f)\n"
+"		{\n"
+"			pOnB1.w = actualDepth;\n"
+"			int dstIdx;\n"
+"			\n"
+"			float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n"
+"			if (lenSqr>FLT_EPSILON)\n"
+"			{\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			\n"
+"				if (dstIdx < maxContactCapacity)\n"
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normalOnSurfaceB1;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"					c->m_worldPosB[0] = pOnB1;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = faceIndex;\n"
+"					GET_NPOINTS(*c) = 1;\n"
+"				} \n"
+"			}\n"
+"		}\n"
+"	}//if (hasCollision)\n"
+"}\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,\n"
+"												__global const BodyData* rigidBodies,\n"
+"												__global const btCollidableGpu* collidables,\n"
+"												__global const ConvexPolyhedronCL* convexShapes, \n"
+"												__global const float4* vertices,\n"
+"												__global const float4* uniqueEdges,\n"
+"												__global const btGpuFace* faces,\n"
+"												__global const int* indices,\n"
+"												__global btAabbCL* aabbs,\n"
+"												__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"												counter32_t nGlobalContactsOut,\n"
+"													int numConcavePairs, int maxContactCapacity\n"
+"												)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"	int pairIdx = i;\n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
+"	{\n"
+"		int f = concavePairs[i].z;\n"
+"		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		\n"
+"		float4 verticesA[3];\n"
+"		for (int i=0;i<3;i++)\n"
+"		{\n"
+"			int index = indices[face.m_indexOffset+i];\n"
+"			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"			verticesA[i] = vert;\n"
+"		}\n"
+"		float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
+"		float sphereRadius = collidables[collidableIndexB].m_radius;\n"
+"		float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
+"		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
+"		computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n"
+"																rigidBodies,collidables,\n"
+"																verticesA,\n"
+"																globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
+"																spherePos,sphereRadius,convexPos,convexOrn, f);\n"
+"		return;\n"
+"	}\n"
+"}\n"
+;
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
new file mode 100644
index 0000000000..a6565fd6fa
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
@@ -0,0 +1,2018 @@
+//keep the SHAPE_* values below in sync with the CPU-side shape type enum (in btCollidable.h)
+//written by Erwin Coumans
+
+
+#define SHAPE_CONVEX_HULL 3 // shape-type id, compared against btCollidableGpu.m_shapeType
+#define SHAPE_CONCAVE_TRIMESH 5 // shape-type id; presumably a concave triangle mesh - confirm against btCollidable.h
+#define TRIANGLE_NUM_CONVEX_FACES 5 // NOTE(review): not referenced in this part of the file; meaning to be confirmed
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 // shape-type id, compared against btCollidableGpu.m_shapeType
+
+#define B3_MAX_STACK_DEPTH 256 // capacity of the node-pair stack declared in findCompoundPairsKernel
+
+
+typedef unsigned int u32; // shorthand for unsigned int (used by BodyData.m_collidableIdx)
+
+///GPU-side description of a collidable; keep this layout in sync with btCollidable.h
+typedef struct
+{
+	union { // first word: two names for the same storage
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	union // second word: either a radius or a BVH index (which one is valid presumably depends on m_shapeType)
+	{
+		float m_radius; // read as the sphere radius by the sphere contact code
+		int	m_compoundBvhIndex; // used to index the b3BvhInfo array in findCompoundPairsKernel
+	};
+	
+	int m_shapeType; // compared against the SHAPE_* ids
+	int m_shapeIndex; // used to index the ConvexPolyhedronCL array when the shape is a convex hull
+	
+} btCollidableGpu;
+
+#define MAX_NUM_PARTS_IN_BITS 10 // getTriangleIndex() strips the sign bit plus this many bits directly below it
+
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+{
+	//12 bytes: AABB corners quantized to one unsigned short per axis (expanded by MyUnQuantize/MyUnQuantizeGlobal)
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes: >= 0 means leaf (triangle index), < 0 means internal node (negated escape index)
+	int	m_escapeIndexOrTriangleIndex;
+} b3QuantizedBvhNode;
+
+typedef struct // per-BVH header record; findCompoundPairsKernel indexes it with btCollidableGpu.m_compoundBvhIndex
+{
+	float4		m_aabbMin; // added back to un-quantized node bounds
+	float4		m_aabbMax;
+	float4		m_quantization; // per-axis scale; quantized values are divided by it
+	int			m_numNodes;
+	int			m_numSubTrees; // number of b3BvhSubtreeInfo entries belonging to this BVH
+	int			m_nodeOffset; // added to a subtree's root node index to index the shared node array
+	int			m_subTreeOffset; // index of this BVH's first entry in the shared subtree array
+
+} b3BvhInfo;
+
+
+int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)
+{
+	// Mask covering the sign bit plus the MAX_NUM_PARTS_IN_BITS bits directly below it.
+	const unsigned int highBitsMask = (~0u) << (31 - MAX_NUM_PARTS_IN_BITS);
+	const unsigned int indexWord = rootNode->m_escapeIndexOrTriangleIndex;
+	return (indexWord & ~highBitsMask);	// keep only the low bits, where the triangle index is stored
+}
+
+int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	// Same as getTriangleIndex(), for a node that lives in __global memory.
+	const unsigned int highBitsMask = (~0u) << (31 - MAX_NUM_PARTS_IN_BITS);
+	const unsigned int indexWord = rootNode->m_escapeIndexOrTriangleIndex;
+	return (indexWord & ~highBitsMask);	// keep only the low bits, where the triangle index is stored
+}
+
+int isLeafNode(const b3QuantizedBvhNode* rootNode)
+{
+	// A negative word is a negated escape index (internal node); anything else is a triangle index (leaf).
+	return (rootNode->m_escapeIndexOrTriangleIndex < 0) ? 0 : 1;
+}
+
+int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	// Same test as isLeafNode(), for a node in __global memory: negative word = internal node, otherwise leaf.
+	return (rootNode->m_escapeIndexOrTriangleIndex < 0) ? 0 : 1;
+}
+	
+int getEscapeIndex(const b3QuantizedBvhNode* rootNode)
+{
+	return -(rootNode->m_escapeIndexOrTriangleIndex);	// internal nodes store the escape index negated
+}
+
+int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	return -(rootNode->m_escapeIndexOrTriangleIndex);	// __global variant of getEscapeIndex(): undo the stored negation
+}
+
+
+typedef struct
+{
+	//12 bytes: quantized bounds of the subtree (expanded with MyUnQuantize in findCompoundPairsKernel)
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree (b3BvhInfo.m_nodeOffset is added before indexing the node array)
+	int			m_rootNodeIndex;
+	//4 bytes, added to the start node index to obtain the subtree's end node index
+	int			m_subtreeSize;
+	int			m_padding[3];
+} b3BvhSubtreeInfo;
+
+
+
+
+
+
+
+typedef struct
+{
+	float4	m_childPosition; // child translation; rotated by the parent orientation and added to the parent position
+	float4	m_childOrientation; // child rotation; composed as qtMul(parentOrientation, m_childOrientation)
+	int m_shapeIndex; // used by processCompoundPairsKernel as the child's index into the collidables array
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct
+{
+	float4 m_pos; // translation of the body's transform
+	float4 m_quat; // rotation of the body's transform (quaternion: vector part in xyz, scalar in w)
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx; // index into the collidables array
+	float m_invMass; // 0 is treated as a static body by the kernels in this file
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  
+{
+	float4		m_localCenter; // hull center in shape-local coordinates (kernels transform it by the body pose)
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	
+	float			m_radius;
+	int	m_faceOffset; // index of this hull's first face in the shared btGpuFace array
+	int m_numFaces; // number of faces, starting at m_faceOffset
+	int	m_numVertices; // number of vertices, starting at m_vertexOffset
+
+	int m_vertexOffset; // index of this hull's first vertex in the shared vertex array
+	int	m_uniqueEdgesOffset; // index of this hull's first entry in the shared unique-edge array
+	int	m_numUniqueEdges; // number of unique edges, starting at m_uniqueEdgesOffset
+	int m_unused;
+} ConvexPolyhedronCL;
+
+typedef struct 
+{
+	union // three overlapping views of the same four 32-bit lanes (presumably the AABB minimum)
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union // three overlapping views of the same four 32-bit lanes (presumably the AABB maximum)
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+
+
+
+typedef struct
+{
+	float4 m_plane; // the SAT code rotates this with qtRotate (which ignores w) and uses it as the face normal
+	int m_indexOffset; // index of this face's first entry in the shared indices array
+	int m_numIndices;
+} btGpuFace;
+
+#define make_float4 (float4) // make_float4(a,b,c,d) expands to the vector literal (float4)(a,b,c,d)
+
+
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	return cross(a,b);
+	// The built-in cross() on float4 operands uses only .xyz and returns w = 0 (OpenCL geometric functions).
+	// Everything below is unreachable, commented-out history of alternative formulations.
+//	float4 a1 = make_float4(a.xyz,0.f);
+//	float4 b1 = make_float4(b.xyz,0.f);
+
+//	return cross(a1,b1);
+
+//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);
+	
+	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);
+	
+	//return c;
+}
+
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	// Three-component dot product: w is forced to zero on both operands before the 4-wide dot().
+	a.w = 0.f;
+	return dot(a, make_float4(b.xyz,0.f));
+}
+
+__inline
+float4 fastNormalize4(float4 v)
+{
+	// Despite the name, only xyz is normalized: w is replaced by zero before calling fast_normalize().
+	return fast_normalize(make_float4(v.xyz,0.f));
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion; // vector part in xyz, scalar part in w (see qtInvert)
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b); // quaternion product of a and b
+
+__inline
+Quaternion qtNormalize(Quaternion in);
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec); // rotates vec (w ignored) by q
+
+__inline
+Quaternion qtInvert(Quaternion q); // negates xyz, keeps w
+
+
+
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// Quaternion product with the scalar part stored in w:
+	//   vector part = a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz
+	//   scalar part = a.w*b.w - dot(a.xyz, b.xyz)   (overwrites the w produced by the line below)
+	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+}
+
+__inline
+Quaternion qtNormalize(Quaternion in)
+{
+	// Normalize over all four components. The previous implementation went through
+	// fastNormalize4(), which zeroes w first and therefore discarded the scalar part.
+	return fast_normalize(in);
+}
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	// Rotate vec by q using the sandwich product q * (vec.xyz, 0) * qtInvert(q).
+	vec.w = 0.f;
+	const Quaternion conjugate = qtInvert( q );
+	const Quaternion rotated = qtMul(qtMul(q,vec),conjugate);
+	return rotated;
+}
+
+__inline
+Quaternion qtInvert(Quaternion q)
+{
+	return make_float4(-q.xyz, q.w);	// conjugate: negated vector part, unchanged scalar part
+}
+
+__inline
+float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	return qtRotate( make_float4(-q.xyz, q.w), vec );	// rotate by the conjugate of q (what qtInvert returns)
+}
+
+__inline
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	return qtRotate( orientation[0], p[0] ) + translation[0];	// rotate the point first, then translate it
+}
+
+
+
+__inline
+float4 normalize3(const float4 a)
+{
+	// Normalize the xyz part of a; the result's w lane comes from a zeroed input lane.
+	return fastNormalize4( make_float4(a.x, a.y, a.z, 0.f) );
+}
+
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max)
+{	// Projects the hull's vertices onto *dir and writes the resulting interval to min[0]/max[0].
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	// Rotate the axis into the hull's local frame once instead of rotating every vertex.
+	const float4 localDir = qtInvRotate(orn,*dir);
+	float offset = dot(pos,*dir); // shift of the interval due to the hull's translation (4-wide dot, so pos.w matters)
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0]) // only possible when no vertex replaced the FLT_MAX sentinels (e.g. numVerts == 0)
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+}
+
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+{	// Same as projectLocal(), but the hull and its vertices are read through __global pointers.
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+	// Rotate the axis into the hull's local frame once instead of rotating every vertex.
+	const float4 localDir = qtInvRotate(orn,*dir);
+	float offset = dot(pos,*dir); // shift of the interval due to the hull's translation (4-wide dot, so pos.w matters)
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0]) // only possible when no vertex replaced the FLT_MAX sentinels (e.g. numVerts == 0)
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+}
+
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+{
+	float loA,hiA;
+	float loB,hiB;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &loA, &hiA);
+	project(hullB,posB,ornB, sep_axis,verticesB, &loB, &hiB);
+	// Disjoint intervals: sep_axis separates the hulls and *depth is left untouched.
+	if(hiA<loB || hiB<loA)
+		return false;
+	// Otherwise report the smaller of the two overlaps.
+	const float overlapAB = hiA - loB;
+	const float overlapBA = hiB - loA;
+	depth[0] = overlapAB<overlapBA ? overlapAB : overlapBA;
+	return true;
+}
+
+
+
+
+inline bool IsAlmostZero(const float4 v)
+{
+	// True when none of x, y, z exceeds the tolerance in magnitude; w is not inspected.
+	const float tolerance = 1e-6f;
+	return !(fabs(v.x)>tolerance || fabs(v.y)>tolerance || fabs(v.z)>tolerance);
+}
+
+
+
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2, // only its sign relative to each candidate axis is used
+	// Hull A is read through unqualified (non-__global) pointers:
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, // unused here
+	const btGpuFace* facesA,
+	const int*  indicesA, // unused here
+	// Hull B is read through __global pointers:
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, // unused here
+	__global const btGpuFace* facesB, // unused here
+	__global const int*  indicesB, // unused here
+	float4* sep, // in/out: axis with the smallest overlap found so far
+	float* dmin) // in/out: smallest overlap found so far; must be initialized by the caller
+{
+	// Tests the face normals of hull A as separating axes. Returns false as soon as one of them
+	// separates the hulls; otherwise returns true with *sep / *dmin tightened where a smaller overlap was found.
+	float4 posA = posA1;
+	posA.w = 0.f; // keep w out of the 4-wide dot(pos,dir) done by the projection helpers
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0; // incremented below but never read
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal); // qtRotate ignores the w lane of m_plane
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS*=-1.f;
+			curPlaneTests++;
+			float d;
+			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))
+				return false;
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f) // flip *sep when it points against DeltaC2
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2, // only its sign relative to each candidate axis is used
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, // unused here
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA, // unused here
+	const float4* verticesB,
+	const float4* uniqueEdgesB, // unused here
+	const btGpuFace* facesB, // unused here
+	const int*  indicesB, // unused here
+	float4* sep, // in/out: axis with the smallest overlap found so far
+	float* dmin) // in/out: smallest overlap found so far; must be initialized by the caller
+{
+	// Counterpart of findSeparatingAxisLocalA with the address spaces swapped: hull A (whose face
+	// normals are tested) is read through __global pointers, hull B through unqualified pointers.
+	float4 posA = posA1;
+	posA.w = 0.f; // keep w out of the 4-wide dot(pos,dir) done by the projection helpers
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0; // incremented below but never read
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal); // qtRotate ignores the w lane of m_plane
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS *= -1.f;
+			curPlaneTests++;
+			float d;
+			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d)) // non-__global hull goes first
+				return false;
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f) // flip *sep when it points against DeltaC2
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+{
+	// Edge-edge pass of the separating-axis test: every cross product of a unique edge of hull A
+	// (non-__global data) with a unique edge of hull B (__global data) is tried as an axis.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	// Returns false as soon as one axis separates the hulls, matching findSeparatingAxisEdgeEdge.
+	// Otherwise returns true; *dmin / *sep (both in/out, initialized by the caller) are updated
+	// whenever an axis with a smaller overlap is found.
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+
+			// (Nearly) parallel edges give a (nearly) zero cross product, which is not a usable axis.
+			float4 crossje = cross3(edge0World,edge1World);
+
+			// Skip degenerate axes; normalize and orient the rest along DeltaC2.
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje *= -1.f;
+
+				float dist;
+				// Project both hulls onto the axis and compare the two intervals.
+				{
+					float Min0,Max0;
+					float Min1,Max1;
+					projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+					project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+				
+					if(Max0<Min1 || Max1<Min0)
+						return false;	// was "result = false", immediately overwritten and never read
+				
+					float d0 = Max0 - Min1;
+					float d1 = Max1 - Min0;
+					dist = d0<d1 ? d0:d1;
+					// dist is the smaller of the two overlaps along this axis.
+
+				}
+				
+
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+
+	}
+
+	
+	if((dot3F4(-DeltaC2,*sep))>0.0f)	// flip *sep when it points against DeltaC2
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, __global const float4* vertices,float* depth)
+{
+	// Project both hulls onto sep_axis; both read their vertices from the same __global array.
+	float minA,maxA, minB,maxB;
+	project(hullA,posA,ornA,sep_axis,vertices, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,vertices, &minB, &maxB);
+	// A gap between the two intervals means sep_axis separates the hulls; *depth stays untouched.
+	if(maxA<minB || maxB<minA)
+		return false;
+	// Otherwise report the smaller of the two overlaps.
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	depth[0] = (overlapAB<overlapBA) ? overlapAB : overlapBA;
+	return true;
+}
+
+
+bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2, // only its sign relative to each candidate axis is used
+	__global const float4* vertices, // shared vertex array for both hulls
+	__global const float4* uniqueEdges, // unused here
+	__global const btGpuFace* faces,
+	__global const int*  indices, // unused here
+	float4* sep, // in/out: axis with the smallest overlap found so far
+	float* dmin) // in/out: smallest overlap found so far; must be initialized by the caller
+{
+	// Tests the face normals of hullA as separating axes. Returns false as soon as one of them
+	// separates the hulls; otherwise returns true with *sep / *dmin tightened where a smaller overlap was found.
+	float4 posA = posA1;
+	posA.w = 0.f; // keep w out of the 4-wide dot(pos,dir) done by project()
+	float4 posB = posB1;
+	posB.w = 0.f;
+	
+	int curPlaneTests=0; // incremented below but never read
+
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = faces[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal); // qtRotate ignores the w lane of m_plane
+			// Orient the axis so that it does not point against DeltaC2.
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS*=-1.f;
+				
+			curPlaneTests++;
+	
+			float d;
+			if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))
+				return false;
+			// Keep the axis with the smallest overlap seen so far.
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+
+	// Flip *sep when it points against DeltaC2.
+		if((dot3F4(-DeltaC2,*sep))>0.0f)
+		{
+			*sep = -(*sep);
+		}
+	
+	return true;
+}
+
+
+
+
+bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2, // only its sign relative to each candidate axis is used
+	__global const float4* vertices, // shared vertex array for both hulls
+	__global const float4* unitSphereDirections, // candidate axes, used as given (not normalized here)
+	int numUnitSphereDirections,
+	float4* sep, // in/out: axis with the smallest overlap found so far
+	float* dmin) // in/out: smallest overlap found so far; must be initialized by the caller
+{
+	// Tries each supplied direction as a separating axis; returns false as soon as one separates the hulls.
+	float4 posA = posA1;
+	posA.w = 0.f; // keep w out of the 4-wide dot(pos,dir) done by project()
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0; // never used
+
+	int curEdgeEdge = 0; // never used
+	// Test unit sphere directions
+	for (int i=0;i<numUnitSphereDirections;i++)
+	{
+
+		float4 crossje;
+		crossje = unitSphereDirections[i];	
+		// NOTE(review): the sign test is '>0' here but '<0' in the sibling functions - confirm this is intended.
+		if (dot3F4(DeltaC2,crossje)>0)
+			crossje *= -1.f;
+		{
+			float dist;
+			bool result = true; // written but never read
+			float Min0,Max0;
+			float Min1,Max1;
+			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+		
+			if(Max0<Min1 || Max1<Min0) // disjoint intervals: this axis separates the hulls
+				return false;
+		
+			float d0 = Max0 - Min1;
+			float d1 = Max1 - Min0;
+			dist = d0<d1 ? d0:d1; // smaller of the two overlaps
+			result = true;
+	
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	// Flip *sep when it points against DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2, // only its sign relative to each candidate axis is used
+	__global const float4* vertices, // shared vertex array for both hulls
+	__global const float4* uniqueEdges, // shared unique-edge array for both hulls
+	__global const btGpuFace* faces, // unused here
+	__global const int*  indices, // unused here
+	float4* sep, // in/out: axis with the smallest overlap found so far
+	float* dmin) // in/out: smallest overlap found so far; must be initialized by the caller
+{
+	// Edge-edge pass of the separating-axis test: every cross product of a unique edge of hullA with a
+	// unique edge of hullB is tried as an axis. Returns false as soon as one axis separates the hulls.
+	float4 posA = posA1;
+	posA.w = 0.f; // keep w out of the 4-wide dot(pos,dir) done by project()
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0; // never used
+
+	int curEdgeEdge = 0; // incremented below but never read
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+
+			// (Nearly) parallel edges give a (nearly) zero cross product, which is skipped below.
+			float4 crossje = cross3(edge0World,edge1World);
+
+			curEdgeEdge++;
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje*=-1.f;
+					
+				float dist;
+				bool result = true; // written but never read
+				{
+					float Min0,Max0;
+					float Min1,Max1;
+					project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+					project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+				
+					if(Max0<Min1 || Max1<Min0) // disjoint intervals: this axis separates the hulls
+						return false;
+				
+					float d0 = Max0 - Min1;
+					float d1 = Max1 - Min0;
+					dist = d0<d1 ? d0:d1; // smaller of the two overlaps
+					result = true;
+
+				}
+				
+
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+
+	}
+
+	// Flip *sep when it points against DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+// work-in-progress
+__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global volatile float4* gpuCompoundSepNormalsOut,
+																					__global volatile int* gpuHasCompoundSepNormalsOut,
+																					int numCompoundPairs
+																					)
+{
+	// One work-item per entry of gpuCompoundPairs: runs the SAT passes on the pair's two convex hulls.
+	int i = get_global_id(0);
+	if (i<numCompoundPairs)
+	{
+		int bodyIndexA = gpuCompoundPairs[i].x;
+		int bodyIndexB = gpuCompoundPairs[i].y;
+		// .z/.w are child shape indices; a negative value selects the body's own collidable instead.
+		int childShapeIndexA = gpuCompoundPairs[i].z;
+		int childShapeIndexB = gpuCompoundPairs[i].w;
+		
+		int collidableIndexA = -1;
+		int collidableIndexB = -1;
+		
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		
+		float4 ornB = rigidBodies[bodyIndexB].m_quat;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		// For a child shape, compose the child's transform with the body's transform.
+		if (childShapeIndexA >= 0)
+		{
+			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+			float4 newPosA = qtRotate(ornA,childPosA)+posA;
+			float4 newOrnA = qtMul(ornA,childOrnA);
+			posA = newPosA;
+			ornA = newOrnA;
+		} else
+		{
+			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		}
+		
+		if (childShapeIndexB>=0)
+		{
+			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		} else
+		{
+			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+		}
+		// Default result; only set to 1 at the end when no pass found a separating axis.
+		gpuHasCompoundSepNormalsOut[i] = 0;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	
+		int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	
+		// Only convex hull vs convex hull is handled here.
+		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))
+		{
+			return;
+		}
+
+		int hasSeparatingAxis = 5; // assigned several times below but never read in this kernel
+							
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces; // unused
+		float dmin = FLT_MAX;
+		posA.w = 0.f;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1; // offset from hull B's transformed center to hull A's
+		float4 sepNormal = make_float4(1,0,0,0);
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA) // the helpers return false when they find a separating axis
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+			// Second pass: same test with the hulls swapped, i.e. using hull B's face normals.
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else//(!sepB)
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+				if (sepEE)
+				{
+						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);
+						gpuHasCompoundSepNormalsOut[i] = 1;
+				}//sepEE
+			}//(!sepB)
+		}//(!sepA)
+		
+		
+	}
+		
+}
+
+
+inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+{
+		// Undo the quantization of an AABB corner: divide each stored value by the
+		// per-axis scale, then add the BVH's minimum corner back.
+		const float x = (float)(vecIn[0]) / (quantization.x);
+		const float y = (float)(vecIn[1]) / (quantization.y);
+		const float z = (float)(vecIn[2]) / (quantization.z);
+
+		b3Float4 unquantized = b3MakeFloat4(x, y, z, 0.f);
+		unquantized += bvhAabbMin;
+		return unquantized;
+}
+
+inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+{
+		// Same as MyUnQuantize(), for quantized values read from __global memory:
+		// divide by the per-axis scale, then add the BVH's minimum corner back.
+		const float x = (float)(vecIn[0]) / (quantization.x);
+		const float y = (float)(vecIn[1]) / (quantization.y);
+		const float z = (float)(vecIn[2]) / (quantization.z);
+
+		b3Float4 unquantized = b3MakeFloat4(x, y, z, 0.f);
+		unquantized += bvhAabbMin;
+		return unquantized;
+}
+
+
+// work-in-progress
+__kernel void   findCompoundPairsKernel( __global const int4* pairs, 
+	__global const BodyData* rigidBodies, 
+	__global const btCollidableGpu* collidables,
+	__global const ConvexPolyhedronCL* convexShapes, 
+	__global const float4* vertices,
+	__global const float4* uniqueEdges,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	__global b3Aabb_t* aabbLocalSpace,
+	__global const btGpuChildShape* gpuChildShapes,
+	__global volatile int4* gpuCompoundPairsOut,
+	__global volatile int* numCompoundPairsOut,
+	__global const b3BvhSubtreeInfo* subtrees,
+	__global const b3QuantizedBvhNode* quantizedNodes,
+	__global const b3BvhInfo* bvhInfos,
+	int numPairs,
+	int maxNumCompoundPairsCapacity
+	)
+{
+
+	int i = get_global_id(0);
+
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;
+			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;
+			int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;
+			int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;
+			int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;
+
+
+			int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;
+			
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+			
+			for (int p=0;p<numSubTreesA;p++)
+			{
+				b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];
+				//bvhInfos[bvhA].m_quantization
+				b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+				b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+
+				b3Float4 aabbAMinOut,aabbAMaxOut;
+				float margin=0.f;
+				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+				
+				for (int q=0;q<numSubTreesB;q++)
+				{
+					b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];
+
+					b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+					b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+
+					b3Float4 aabbBMinOut,aabbBMaxOut;
+					float margin=0.f;
+					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+
+					
+					
+					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+					if (aabbOverlap)
+					{
+						
+						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;
+						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;
+
+						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;
+						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;
+
+
+						b3Int2 nodeStack[B3_MAX_STACK_DEPTH];
+						b3Int2 node0;
+						node0.x = startNodeIndexA;
+						node0.y = startNodeIndexB;
+						int maxStackDepth = B3_MAX_STACK_DEPTH;
+						int depth=0;
+						nodeStack[depth++]=node0;
+
+						do
+						{
+							b3Int2 node = nodeStack[--depth];
+
+							b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+							b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+
+							b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+							b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+
+							float margin=0.f;
+							b3Float4 aabbAMinOut,aabbAMaxOut;
+							b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+
+							b3Float4 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+
+							
+							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (nodeOverlap)
+							{
+								bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);
+								bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);
+								bool isInternalA = !isLeafA;
+								bool isInternalB = !isLeafB;
+
+								//fail, even though it might hit two leaf nodes
+								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))
+								{
+									//printf("Error: traversal exceeded maxStackDepth");
+									continue;
+								}
+
+								if(isInternalA)
+								{
+									int nodeAleftChild = node.x+1;
+									bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);
+									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);
+
+									if(isInternalB)
+									{					
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);
+									}
+									else
+									{
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);
+									}
+								}
+								else
+								{
+									if(isInternalB)
+									{
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);
+									}
+									else
+									{
+										int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+										if (compoundPairIdx<maxNumCompoundPairsCapacity)
+										{
+											int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);
+											int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);
+											gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+										}
+									}
+								}
+							}
+						} while (depth);
+					}
+				}
+			}
+			
+			return;
+		}
+
+
+
+
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					float4 ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = qtRotate(ornA,childPosA)+posA;
+					float4 newOrnA = qtMul(ornA,childOrnA);
+
+					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;
+					b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];
+					float margin = 0.f;
+					
+					b3Float4 aabbAMinWS;
+					b3Float4 aabbAMaxWS;
+					
+					b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,
+						newPosA,
+						newOrnA,
+						&aabbAMinWS,&aabbAMaxWS);
+						
+					
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							float4 newOrnB = qtMul(ornB,childOrnB);
+
+							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+							b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];
+							
+							b3Float4 aabbBMinWS;
+							b3Float4 aabbBMaxWS;
+							
+							b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,
+								newPosB,
+								newOrnB,
+								&aabbBMinWS,&aabbBMaxWS);
+								
+								
+							
+							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);
+							if (aabbOverlap)
+							{
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								float4 ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								float4 ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+
+								{//
+									int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+							int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+							float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							float4 ornA = newOrnA;
+							float4 c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1 = transform(&c1local,&posB,&ornB);
+							const float4 DeltaC2 = c0 - c1;
+
+							{
+								int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+				}//for (int b=0;b<numChildrenB;b++)	
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) 
+				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					float4 ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = qtRotate(ornB,childPosB)+posB;
+					float4 newOrnB = qtMul(ornB,childOrnB);
+
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+
+
+					//////////////////////////////////////
+
+					if (1)
+					{
+						int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						float4 ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						float4 ornB =newOrnB;
+						float4 c1 = transform(&c1local,&posB,&ornB);
+						const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+}
+
+// work-in-progress
+// Runs the full SAT sequence for pair i inside one kernel: findSeparatingAxis with A first, then with
+// B first, then findSeparatingAxisEdgeEdge. Work items with i>=numPairs do nothing. Static-static
+// pairs and pairs where either shape is not SHAPE_CONVEX_HULL get hasSeparatingAxis[i] = 0.
+//   pairs              .x/.y are the rigid body indices of pair i
+//   rigidBodies        read for m_collidableIdx, m_invMass, m_pos and m_quat
+//   collidables        read for m_shapeIndex and m_shapeType
+//   convexShapes       m_localCenter is read here; the entries are also handed to the helpers
+//   vertices, uniqueEdges, faces, indices   forwarded unchanged to the helpers
+//   aabbs              accepted but never read by this kernel
+//   separatingNormals  out: written for pair i only when the result is 1
+//   hasSeparatingAxis  out: 1 when all three helper calls return true, 0 on every other path
+//   numPairs           number of entries of pairs to process
+__kernel void   findSeparatingAxisKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global volatile float4* separatingNormals,
+																					__global volatile int* hasSeparatingAxis,
+																					int numPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		
+		//only convex hull against convex hull is handled by this kernel
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+
+		//no separate SHAPE_CONCAVE_TRIMESH test is needed: the check above already
+		//rejects every pair in which either shape is not SHAPE_CONVEX_HULL
+
+		float dmin = FLT_MAX;
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal;
+		
+		//the helpers run in the same order as before and share sepNormal/dmin;
+		//a false return from any of them ends the sequence, so a later helper
+		//is only called while every earlier one returned true
+		bool allTestsPassed = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		if (allTestsPassed)
+		{
+			allTestsPassed = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+		}
+		if (allTestsPassed)
+		{
+			allTestsPassed = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+		}
+
+		//publish the verdict with a single store per output; the former placeholder
+		//value 4 was overwritten with 0 or 1 on every path and is no longer written
+		if (allTestsPassed)
+		{
+			separatingNormals[i] = sepNormal;
+		}
+		hasSeparatingAxis[i] = allTestsPassed ? 1 : 0;
+		
+	}
+
+}
+
+
+// First half of the split SAT test: calls findSeparatingAxis with A first, then with B first.
+// hasSeparatingAxis[i] becomes 1 (with dmins[i] and separatingNormals[i] updated) only when both calls
+// return true; on every other path it stays 0, which findSeparatingAxisEdgeEdgeKernel treats as "skip".
+// dmins[i] is reset to FLT_MAX for pairs that pass the early-outs; aabbs is never read.
+__kernel void   findSeparatingAxisVertexFaceKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global volatile float4* separatingNormals,
+																					__global volatile int* hasSeparatingAxis,
+																					__global  float* dmins,
+																					int numPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	
+		//default verdict: every early return and every failed test below relies on this 0
+		hasSeparatingAxis[i] = 0;	
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		
+		//only convex hull against convex hull is handled by this kernel
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			return;
+		}
+			
+		float dmin = FLT_MAX;
+
+		dmins[i] = dmin;
+		
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal;
+		
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		//the second test only runs when the first one returned true
+		bool sepB = false;
+		if (sepA)
+		{
+			sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+		}
+
+		//fix: the flag used to be left at the placeholder value 4 when sepA was true but sepB was false,
+		//which kept a rejected pair alive for the edge-edge kernel; now the flag simply stays 0
+		if (sepB)
+		{
+			dmins[i] = dmin;
+			hasSeparatingAxis[i] = 1;
+			separatingNormals[i] = sepNormal;
+		}
+	}
+
+}
+
+
+// Second half of the split SAT test. Only pairs whose hasSeparatingAxis[i] is still non-zero are
+// processed; dmins[i] and separatingNormals[i] seed the edge-edge search. When the pair has at most
+// numUnitSphereDirections edge-edge directions, hasSeparatingAxis[i] is rewritten to 0 or 1 (and
+// separatingNormals[i] updated on 1); otherwise nothing is written for the pair by this kernel.
+__kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global  float4* separatingNormals,
+																					__global  int* hasSeparatingAxis,
+																					__global  float* dmins,
+																					__global const float4* unitSphereDirections,
+																					int numUnitSphereDirections,
+																					int numPairs
+																					)
+{
+	int i = get_global_id(0);
+
+	//guard clauses: out-of-range work items and pairs whose flag is already 0
+	if (i>=numPairs)
+		return;
+	if (!hasSeparatingAxis[i])
+		return;
+
+	int bodyIndexA = pairs[i].x;
+	int bodyIndexB = pairs[i].y;
+
+	//body -> collidable -> shape index
+	int shapeIndexA = collidables[rigidBodies[bodyIndexA].m_collidableIdx].m_shapeIndex;
+	int shapeIndexB = collidables[rigidBodies[bodyIndexB].m_collidableIdx].m_shapeIndex;
+
+	//continue from the values stored for this pair
+	float dmin = dmins[i];
+	float4 sepNormal = separatingNormals[i];
+
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	posA.w = 0.f;
+	float4 ornA = rigidBodies[bodyIndexA].m_quat;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	posB.w = 0.f;
+	float4 ornB =rigidBodies[bodyIndexB].m_quat;
+
+	//difference of the two local centers after transform() with each body's position/orientation
+	float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+	float4 c0 = transform(&c0local, &posA, &ornA);
+	float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+	float4 c1 = transform(&c1local,&posB,&ornB);
+	const float4 DeltaC2 = c0 - c1;
+
+	//pairs with more edge-edge directions than unit sphere directions are not tested here
+	int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
+	if (numEdgeEdgeDirections>numUnitSphereDirections)
+	{
+			/*
+			///else case is a separate kernel, to make Mac OSX OpenCL compiler happy
+			else
+			{
+				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,unitSphereDirections,numUnitSphereDirections,
+																									&sepNormal,&dmin);
+					if (!sepEE)
+					{
+						hasSeparatingAxis[i] = 0;
+					} else
+					{
+						hasSeparatingAxis[i] = 1;
+						separatingNormals[i] = sepNormal;
+					}
+			}
+			*/
+		return;
+	}
+
+	bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+
+	if (sepEE)
+	{
+		hasSeparatingAxis[i] = 1;
+		separatingNormals[i] = sepNormal;
+	}
+	else
+	{
+		hasSeparatingAxis[i] = 0;
+	}
+}
+
+
+
+
+
+// Chooses the pair of faces used for clipping and stores their world-space vertices.
+//  - hullB: the face whose rotated normal has the largest dot3F4 with separatingNormal
+//  - hullA: the face whose rotated normal has the smallest dot3F4 with separatingNormal;
+//           that normal is also stored in worldNormalsA1[pairIndex]
+// At most capacityWorldVerts vertices per face are written, starting at index
+// pairIndex*capacityWorldVerts of worldVertsA1 / worldVertsB1. clippingFaces[pairIndex] receives
+// (face of A, face of B, vertex count of A, vertex count of B).
+// minDist and maxDist are not used, and the return value is always 0.
+inline int	findClippingFaces(const float4 separatingNormal,
+                      const ConvexPolyhedronCL* hullA, 
+					  __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1,
+                      __global float4* worldNormalsA1,
+                      __global float4* worldVertsB1,
+                      int capacityWorldVerts,
+                      const float minDist, float maxDist,
+					  const float4* verticesA,
+                      const btGpuFace* facesA,
+                      const int* indicesA,
+					  __global const float4* verticesB,
+                      __global const btGpuFace* facesB,
+                      __global const int* indicesB,
+                      __global int4* clippingFaces, int pairIndex)
+{
+
+	//--- hull B: face whose rotated normal has the largest dot product with separatingNormal
+	int bestFaceB = 0;
+	float largestDot = -FLT_MAX;
+	for (int f=0; f<hullB->m_numFaces; f++)
+	{
+		__global const btGpuFace* candidate = &facesB[hullB->m_faceOffset+f];
+		const float4 localNormal = make_float4(candidate->m_plane.x, candidate->m_plane.y, candidate->m_plane.z, 0.f);
+		const float alignment = dot3F4(qtRotate(ornB, localNormal), separatingNormal);
+		if (alignment > largestDot)
+		{
+			largestDot = alignment;
+			bestFaceB = f;
+		}
+	}
+
+	//--- world-space vertices of that face, clamped to the per-pair capacity
+	//    (the clamp already keeps every loop index below capacityWorldVerts)
+	const btGpuFace polyB = facesB[hullB->m_faceOffset+bestFaceB];
+	int vertexLimitB = polyB.m_numIndices;
+	if (vertexLimitB > capacityWorldVerts)
+	{
+		vertexLimitB = capacityWorldVerts;
+	}
+	int numWorldVertsB = 0;
+	for (int v=0; v<vertexLimitB; v++)
+	{
+		const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+v]];
+		worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB] = transform(&b,&posB,&ornB);
+		numWorldVertsB++;
+	}
+
+	//--- hull A: face whose rotated normal has the smallest dot product with separatingNormal
+	//    (hullA and its face/vertex/index arrays are not __global, unlike those of hull B)
+	int bestFaceA = 0;
+	float smallestDot = FLT_MAX;
+	for (int f=0; f<hullA->m_numFaces; f++)
+	{
+		const btGpuFace* candidate = &facesA[hullA->m_faceOffset+f];
+		const float4 localNormal = make_float4(candidate->m_plane.x, candidate->m_plane.y, candidate->m_plane.z, 0.f);
+		const float4 faceANormalWS = qtRotate(ornA, localNormal);
+		const float alignment = dot3F4(faceANormalWS, separatingNormal);
+		if (alignment < smallestDot)
+		{
+			smallestDot = alignment;
+			bestFaceA = f;
+			//the normal of the current best face is stored each time the best face changes
+			worldNormalsA1[pairIndex] = faceANormalWS;
+		}
+	}
+
+	//--- world-space vertices of the chosen A face, clamped the same way
+	const btGpuFace* polyA = &facesA[hullA->m_faceOffset+bestFaceA];
+	int numVerticesA = polyA->m_numIndices;
+	if (numVerticesA > capacityWorldVerts)
+	{
+		numVerticesA = capacityWorldVerts;
+	}
+	for (int v=0; v<numVerticesA; v++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA->m_indexOffset+v]];
+		worldVertsA1[pairIndex*capacityWorldVerts+v] = transform(&a, &posA,&ornA);
+	}
+
+	//--- publish the selection for this pair
+	clippingFaces[pairIndex].x = bestFaceA;
+	clippingFaces[pairIndex].y = bestFaceB;
+	clippingFaces[pairIndex].z = numVerticesA;
+	clippingFaces[pairIndex].w = numWorldVertsB;
+
+	//no contacts are generated in this function
+	return 0;
+}
+
+
+
+
+// work-in-progress: per concave pair, tests one triangle face of shape A (rebuilt below as a small temporary polyhedron) against shape B
+__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,	// in/out: x,y = body indices; z = face index in shape A (reset to -1 at the end); w = child shape index of B for compounds, -1 once rejected
+																					__global const BodyData* rigidBodies,
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global btAabbCL* aabbs,	// read at index bodyIndexB for the triangle-vs-B box test
+																					__global float4* concaveSeparatingNormalsOut,	// out: sepAxis with dmin stored in w
+																					__global int* concaveHasSeparatingNormals,	// out: 0 or 1 per pair
+																					__global int4* clippingFacesOut,	// forwarded to findClippingFaces
+																					__global float4* worldVertsA1GPU,	// forwarded to findClippingFaces
+																					__global float4*  worldNormalsAGPU,	// forwarded to findClippingFaces
+																					__global float4* worldVertsB1GPU,	// forwarded to findClippingFaces
+																					int vertexFaceCapacity,	// forwarded to findClippingFaces as capacityWorldVerts
+																					int numConcavePairs
+																					)
+{
+	// One work item per entry of concavePairs; out-of-range work items exit immediately.
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	// Default flag is 0; it is set to 1 further down only when hasSeparatingAxis ends up non-zero.
+	concaveHasSeparatingNormals[i] = 0;
+
+	int pairIdx = i;
+
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	// B must be a convex hull or a compound of convex hulls; any other shape type deactivates the pair (w = -1) and returns without touching z.
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1;
+		return;
+	}
+
+
+
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces;	// not used below
+	int numActualConcaveConvexTests = 0;	// only incremented below, never read
+	// z selects which face of shape A this pair refers to
+	int f = concavePairs[i].z;
+	
+	bool overlap = false;
+	// Temporary polyhedron for the selected triangle; its offsets are 0 and it is handed to the helpers together with the local arrays verticesA/uniqueEdgesA/facesA/indicesA.
+	ConvexPolyhedronCL convexPolyhedronA;
+
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb;	// declared but never used
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	
+	float4 verticesA[3];
+	for (int i=0;i<3;i++)	// this loop-local i shadows the work-item index i for the duration of the loop
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+			
+		triAabb.m_min = min(triAabb.m_min,vert);		
+		triAabb.m_max = max(triAabb.m_max,vert);		
+
+	}
+	// NOTE(review): triAabb is built from the untransformed vertices of A (posA/ornA are not applied) and compared with aabbs[bodyIndexB] - presumably A is expected to have an identity transform; confirm.
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+		
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5;	// initial value is overwritten before it is read
+		float4 sepAxis=make_float4(1,2,3,4);
+
+		int localCC=0;	// not used below
+		numActualConcaveConvexTests++;
+
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+
+
+		convexPolyhedronA.m_faceOffset = 0;
+                                  
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+                             
+		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];	// 3 front indices + 3 back indices + 2 for each of the three edge faces
+		int curUsedIndices=0;
+		int fidx=0;
+
+		//front side of triangle: plane copied from the mesh face, indices 0,1,2
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle: reversed index order 2,1,0 and negated normal
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;	// computed but not used; c above is what gets stored
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		// one extra 2-index face per triangle edge; its normal is normalize(cross(normal, v1-v0)) with v1 the previous vertex
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)	// loop-local i again shadows the work-item index
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+                                            
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+                                            
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;	// front + back + 3 edge faces were filled in above (fidx is 5 here)
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);	// average of the three vertices
+
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+
+		
+
+
+		///////////////////
+		///compound shape support
+		// For a compound B, w of the pair is used directly as the index into gpuChildShapes; posB/ornB/shapeIndexB are replaced by the child's values.
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+
+		// Three helper tests in sequence (triangle first, B first, edge-edge); a false return from any of them sets hasSeparatingAxis to 0 and skips the rest.
+		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+												posA,ornA,
+												posB,ornB,
+												DeltaC2,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												vertices,uniqueEdges,faces,indices,
+												&sepAxis,&dmin);
+		hasSeparatingAxis = 4;	// placeholder; every branch below overwrites it with 0 or 1
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,
+												posB,ornB,
+												posA,ornA,
+												DeltaC2,
+												vertices,uniqueEdges,faces,indices,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												&sepAxis,&dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+															posA,ornA,
+															posB,ornB,
+															DeltaC2,
+															verticesA,uniqueEdgesA,facesA,indicesA,
+															vertices,uniqueEdges,faces,indices,
+															&sepAxis,&dmin);
+	
+				if (!sepEE)
+				{
+					hasSeparatingAxis = 0;
+				} else
+				{
+					hasSeparatingAxis = 1;
+				}
+			}
+		}	
+		
+		if (hasSeparatingAxis)
+		{
+			sepAxis.w = dmin;	// dmin travels in the w component of the stored normal
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+
+
+			float minDist = -1e30f;
+			float maxDist = 0.02f;
+
+		
+			// The int returned by findClippingFaces is ignored; its results are the buffers it fills for pairIdx.
+			findClippingFaces(sepAxis,
+                     &convexPolyhedronA,
+					 &convexShapes[shapeIndexB],
+					 posA,ornA,
+					 posB,ornB,
+                      worldVertsA1GPU,
+                      worldNormalsAGPU,
+                      worldVertsB1GPU,
+					  vertexFaceCapacity,
+                      minDist, maxDist,
+                      verticesA,
+                      facesA,
+                      indicesA,
+ 					  vertices,
+                      faces,
+                      indices,
+                      clippingFacesOut, pairIdx);
+
+
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+	
+	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts
+}
+
+
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
new file mode 100644
index 0000000000..f433971741
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
@@ -0,0 +1,1888 @@
+
+#define TRIANGLE_NUM_CONVEX_FACES 5
+
+
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+
+#define GET_GROUP_IDX get_group_id(0) // work-group id along dimension 0
+#define GET_LOCAL_IDX get_local_id(0) // work-item id inside its group, dimension 0
+#define GET_GLOBAL_IDX get_global_id(0) // global work-item id, dimension 0
+#define GET_GROUP_SIZE get_local_size(0) // work-group size along dimension 0
+#define GET_NUM_GROUPS get_num_groups(0) // number of work-groups along dimension 0
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) // work-group barrier with a local-memory fence
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) // local-memory fence only (no barrier)
+#define AtomInc(x) atom_inc(&(x)) // takes an lvalue, passes its address to atom_inc
+#define AtomInc1(x, out) out = atom_inc(&(x)) // same, and stores the value returned by atom_inc in out
+#define AppendInc(x, out) out = atomic_inc(x) // x is already a pointer (e.g. a counter32_t); out receives atomic_inc's return value
+#define AtomAdd(x, value) atom_add(&(x), value) // takes an lvalue, passes its address to atom_add
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) // takes an lvalue, passes its address to atom_cmpxchg
+#define AtomXhg(x, value) atom_xchg ( &(x), value ) // takes an lvalue, passes its address to atom_xchg
+
+#define max2 max // alias for the max built-in
+#define min2 min // alias for the min built-in
+
+typedef unsigned int u32;
+
+
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+
+
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
+
+
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+// Scalar division routed through the OpenCL native_divide built-in
+// (used here instead of the plain '/' operator).
+__inline
+float fastDiv(float numerator, float denominator)
+{
+	const float quotient = native_divide(numerator, denominator);
+	return quotient;
+}
+
+// Component-wise float4 division routed through the native_divide built-in.
+__inline
+float4 fastDiv4(float4 numerator, float4 denominator)
+{
+	const float4 quotient = native_divide(numerator, denominator);
+	return quotient;
+}
+
+
+// Cross product of two float4 values, forwarded to the cross() built-in.
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	const float4 product = cross(a,b);
+	return product;
+}
+
+//#define dot3F4 dot
+
+// Dot product over the x, y and z lanes only: both operands get their w lane
+// zeroed before the 4-wide dot() so w never contributes to the result.
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	return dot(make_float4(a.xyz,0.f), make_float4(b.xyz,0.f));
+}
+
+// Normalizes all four lanes of v with the fast_normalize built-in.
+__inline
+float4 fastNormalize4(float4 v)
+{
+	const float4 unit = fast_normalize(v);
+	return unit;
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion; // quaternion held in a float4; the helpers below use xyz as the vector part and w as the scalar part
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b); // quaternion product a*b
+
+__inline
+Quaternion qtNormalize(Quaternion in); // 4-lane normalization of in
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec); // computes q * (vec.xyz, 0) * qtInvert(q)
+
+__inline
+Quaternion qtInvert(Quaternion q); // negates xyz, keeps w
+
+
+
+
+// Quaternion product a*b, with quaternions stored as (x, y, z, w) and the
+// scalar part in w.
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// Vector part: a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz.
+	// The w lane produced here is a placeholder; it is replaced just below.
+	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);
+	// Scalar part: a.w*b.w - dot(a.xyz, b.xyz).
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+}
+
+// Normalizes a quaternion over all four lanes via fastNormalize4.
+__inline
+Quaternion qtNormalize(Quaternion in)
+{
+	const Quaternion unit = fastNormalize4(in);
+	return unit;
+}
+// Rotates vec by q: evaluates q * (vec.xyz, 0) * qtInvert(q).
+// The incoming w lane of vec is ignored.
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	const Quaternion qConjugate = qtInvert( q );
+	const float4 pureVec = make_float4(vec.xyz, 0.f);
+	const Quaternion halfRotated = qtMul(q,pureVec);
+	return qtMul(halfRotated,qConjugate);
+}
+
+// Returns (-x, -y, -z, w): the conjugate of q, which is its inverse when q
+// has unit length.
+__inline
+Quaternion qtInvert(Quaternion q)
+{
+	Quaternion conjugate = q;
+	conjugate.xyz = -q.xyz;
+	return conjugate;
+}
+
+// Applies the opposite rotation of q to vec (rotates by qtInvert(q)).
+__inline
+float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	const Quaternion inverse = qtInvert( q );
+	return qtRotate( inverse, vec );
+}
+
+// Rigid transform of a point: rotate *p by *orientation, then add *translation.
+__inline
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	const float4 rotated = qtRotate( *orientation, *p );
+	return rotated + (*translation);
+}
+
+
+
+// Normalizes the xyz part of a; w is forced to zero before normalizing so it
+// does not affect the length.
+__inline
+float4 normalize3(const float4 a)
+{
+	float4 xyzOnly = a;
+	xyzOnly.w = 0.f;
+	return fastNormalize4( xyzOnly );
+}
+
+
+// Linear interpolation a + (b - a) * t on the x, y, z lanes; the returned w is 0.
+__inline float4 lerp3(const float4 a,const float4 b, float  t)
+{
+	const float3 blended = a.xyz + (b.xyz - a.xyz) * t;
+	return make_float4(blended, 0.f);
+}
+
+
+
+// Clips a polygon stored in __global memory against a plane and keeps the part
+// whose signed distance dot(planeNormalWS, v) + planeEqWS is negative.
+// The clipped polygon is written to ppVtxOut; the return value is its vertex count.
+// Inputs with fewer than two vertices yield 0. No output capacity is passed in,
+// so the caller must provide enough room in ppVtxOut.
+int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)
+{
+	if (numVertsIn < 2)
+		return 0;
+
+	int outCount = 0;
+
+	// Begin with the closing edge (last vertex -> vertex 0).
+	float4 prevVertex = pVtxIn[numVertsIn-1];
+	float prevDist = dot3F4(planeNormalWS,prevVertex)+planeEqWS;
+
+	for (int k = 0; k < numVertsIn; k++)
+	{
+		const float4 currVertex = pVtxIn[k];
+		const float currDist = dot3F4(planeNormalWS,currVertex)+planeEqWS;
+		const bool prevBehind = (prevDist<0);
+		const bool currBehind = (currDist<0);
+
+		if (prevBehind != currBehind)
+		{
+			// The edge crosses the plane: emit the crossing point first.
+			ppVtxOut[outCount++] = lerp3(prevVertex, currVertex,(prevDist * 1.f/(prevDist - currDist)) );
+		}
+		if (currBehind)
+		{
+			// The edge ends behind the plane: keep its end vertex.
+			ppVtxOut[outCount++] = currVertex;
+		}
+
+		prevVertex = currVertex;
+		prevDist = currDist;
+	}
+	return outCount;
+}
+
+
+
+// Clips a polygon held in private memory against a plane and keeps the part
+// whose signed distance dot(planeNormalWS, v) + planeEqWS is negative.
+// The clipped polygon is written to ppVtxOut; the return value is its vertex count.
+// Inputs with fewer than two vertices yield 0. No output capacity is passed in,
+// so the caller must provide enough room in ppVtxOut.
+int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)
+{
+	if (numVertsIn < 2)
+		return 0;
+
+	int outCount = 0;
+
+	// Begin with the closing edge (last vertex -> vertex 0).
+	float4 prevVertex = pVtxIn[numVertsIn-1];
+	float prevDist = dot3F4(planeNormalWS,prevVertex)+planeEqWS;
+
+	for (int k = 0; k < numVertsIn; k++)
+	{
+		const float4 currVertex = pVtxIn[k];
+		const float currDist = dot3F4(planeNormalWS,currVertex)+planeEqWS;
+		const bool prevBehind = (prevDist<0);
+		const bool currBehind = (currDist<0);
+
+		if (prevBehind != currBehind)
+		{
+			// The edge crosses the plane: emit the crossing point first.
+			ppVtxOut[outCount++] = lerp3(prevVertex, currVertex,(prevDist * 1.f/(prevDist - currDist)) );
+		}
+		if (currBehind)
+		{
+			// The edge ends behind the plane: keep its end vertex.
+			ppVtxOut[outCount++] = currVertex;
+		}
+
+		prevVertex = currVertex;
+		prevDist = currDist;
+	}
+	return outCount;
+}
+
+
+// Clips the world-space polygon in worldVertsB1 against hull A and emits contacts.
+//
+// Steps:
+//  1. Pick the "witness" face of hull A: the face whose world-space normal has the
+//     smallest dot product with separatingNormal.
+//  2. Clip the polygon against one side plane per edge of that face, ping-ponging
+//     between worldVertsB1 and worldVertsB2.
+//  3. For every remaining vertex compute its signed distance to the witness face
+//     (clamped from below to minDist) and, if it is <= maxDist, append
+//     (x, y, z, depth) to contactsOut.
+//
+// Returns the number of contacts written, which never exceeds contactCapacity.
+// NOTE(review): capacityWorldVertsB2 is still not enforced because clipFace has no
+// capacity parameter; both vertex buffers must be large enough for the clipped polygon.
+int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA,  
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	__global const float4* vertices,
+	__global const b3GpuFace_t* faces,
+	__global const int* indices,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+
+	// 1. witness face selection
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+				faces[hullA->m_faceOffset+face].m_plane.x, 
+				faces[hullA->m_faceOffset+face].m_plane.y, 
+				faces[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+		
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+
+	const b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];
+
+	// The witness-face normal does not depend on the edge, so rotate it once.
+	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+	const float4 worldPlaneNormal = qtRotate(ornA,localPlaneNormal);
+
+	// 2. clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	const int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];
+		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+
+		// Side plane through the edge, perpendicular to the witness face.
+		const float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneNormal);
+		const float4 worldA1 = transform(&a,&posA,&ornA);
+		const float planeEqWS = -dot3F4(worldA1,planeNormalWS);
+
+		const int numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+
+		// The clipped output becomes the input of the next edge.
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+	}
+
+	// 3. only keep points that are behind the witness face
+	{
+		const float planeEqWS = polyA.m_plane.w - dot3F4(worldPlaneNormal,posA);
+		// Stop as soon as contactsOut is full instead of writing past its end.
+		for (int i=0; i<numVertsIn && numContactsOut<contactCapacity; i++)
+		{
+			float depth = dot3F4(worldPlaneNormal,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+
+			if (depth <=maxDist)
+			{
+				const float4 pointInWorld = pVtxIn[i];
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+
+	return numContactsOut;
+}
+
+
+
+// Variant of clipFaceAgainstHull in which hull A and its vertex/face/index arrays
+// are passed through pointers without the __global qualifier.
+//
+// Steps:
+//  1. Pick the "witness" face of hull A: the face whose world-space normal has the
+//     smallest dot product with separatingNormal.
+//  2. Clip the polygon in worldVertsB1 against one side plane per edge of that
+//     face, ping-ponging between worldVertsB1 and worldVertsB2.
+//  3. For every remaining vertex compute its signed distance to the witness face
+//     (clamped from below to minDist) and, if it is <= maxDist, append
+//     (x, y, z, depth) to contactsOut.
+//
+// Returns the number of contacts written, which never exceeds contactCapacity.
+// verticesB, facesB and indicesB are accepted but not read here.
+// NOTE(review): capacityWorldVertsB2 is still not enforced because clipFace has no
+// capacity parameter; both vertex buffers must be large enough for the clipped polygon.
+int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA,  
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const b3GpuFace_t* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const b3GpuFace_t* facesB,
+	__global const int* indicesB,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+
+	// 1. witness face selection
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+				facesA[hullA->m_faceOffset+face].m_plane.x, 
+				facesA[hullA->m_faceOffset+face].m_plane.y, 
+				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+		
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+
+	const b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];
+
+	// The witness-face normal does not depend on the edge, so rotate it once.
+	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+	const float4 worldPlaneNormal = qtRotate(ornA,localPlaneNormal);
+
+	// 2. clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	const int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+
+		// Side plane through the edge, perpendicular to the witness face.
+		const float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneNormal);
+		const float4 worldA1 = transform(&a,&posA,&ornA);
+		const float planeEqWS = -dot3F4(worldA1,planeNormalWS);
+
+		const int numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+
+		// The clipped output becomes the input of the next edge.
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+	}
+
+	// 3. only keep points that are behind the witness face
+	{
+		const float planeEqWS = polyA.m_plane.w - dot3F4(worldPlaneNormal,posA);
+		// Stop as soon as contactsOut is full instead of writing past its end.
+		for (int i=0; i<numVertsIn && numContactsOut<contactCapacity; i++)
+		{
+			float depth = dot3F4(worldPlaneNormal,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+
+			if (depth <=maxDist)
+			{
+				const float4 pointInWorld = pVtxIn[i];
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+
+	return numContactsOut;
+}
+
+// Generates contact candidates between two convex hulls for a given separating normal.
+//
+// Picks the face of hull B whose world-space normal has the largest dot product
+// with separatingNormal, transforms its vertices to world space into worldVertsB1,
+// and hands that polygon to clipFaceAgainstHull to be clipped against hull A.
+//
+// Returns the number of contacts written to localContactsOut; 0 when hull B
+// yields no candidate face. At most capacityWorldVerts vertices of the chosen
+// face are copied, so worldVertsB1 is never written past its capacity.
+int	clipHullAgainstHull(const float4 separatingNormal,
+	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	__global const float4* vertices,
+	__global const b3GpuFace_t* faces,
+	__global const int* indices,
+	float4*	localContactsOut,
+	int localContactCapacity)
+{
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+
+	for(int face=0;face<hullB->m_numFaces;face++)
+	{
+		const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, 
+			faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+		const float4 WorldNormal = qtRotate(ornB, Normal);
+		const float d = dot3F4(WorldNormal,separatingNormal);
+		if (d > dmax)
+		{
+			dmax = d;
+			closestFaceB = face;
+		}
+	}
+
+	// Must be checked before faces[] is indexed with closestFaceB: with no
+	// candidate (e.g. a hull without faces) the index would be offset by -1.
+	if (closestFaceB<0)
+		return 0;
+
+	int numWorldVertsB1= 0;
+	{
+		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];
+		// Never copy more vertices than worldVertsB1 can hold.
+		const int numVertices = min2(polyB.m_numIndices, capacityWorldVerts);
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	return clipFaceAgainstHull(separatingNormal, hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,
+			faces,
+			indices,localContactsOut,localContactCapacity);
+}
+
+
+// Variant of clipHullAgainstHull in which hull A and its vertex/face/index arrays
+// are passed through pointers without the __global qualifier, while hull B still
+// lives in __global memory.
+//
+// Picks the face of hull B whose world-space normal has the largest dot product
+// with separatingNormal, transforms its vertices to world space into worldVertsB1,
+// and hands that polygon to clipFaceAgainstHullLocalA.
+//
+// Returns the number of contacts written to localContactsOut; 0 when hull B
+// yields no candidate face. At most capacityWorldVerts vertices of the chosen
+// face are copied, so worldVertsB1 is never written past its capacity.
+int	clipHullAgainstHullLocalA(const float4 separatingNormal,
+	const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const b3GpuFace_t* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const b3GpuFace_t* facesB,
+	__global const int* indicesB,
+	float4*	localContactsOut,
+	int localContactCapacity)
+{
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+
+	for(int face=0;face<hullB->m_numFaces;face++)
+	{
+		const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, 
+			facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+		const float4 WorldNormal = qtRotate(ornB, Normal);
+		const float d = dot3F4(WorldNormal,separatingNormal);
+		if (d > dmax)
+		{
+			dmax = d;
+			closestFaceB = face;
+		}
+	}
+
+	// Must be checked before facesB[] is indexed with closestFaceB: with no
+	// candidate (e.g. a hull without faces) the index would be offset by -1.
+	if (closestFaceB<0)
+		return 0;
+
+	int numWorldVertsB1= 0;
+	{
+		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		// Never copy more vertices than worldVertsB1 can hold.
+		const int numVertices = min2(polyB.m_numIndices, capacityWorldVerts);
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	return clipFaceAgainstHullLocalA(separatingNormal, hullA, 
+			posA,ornA,
+			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+			verticesA,facesA,indicesA,
+			verticesB,facesB,indicesB,
+			localContactsOut,localContactCapacity);
+}
+
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; // plain sequential loop despite the name: accumulates v[1..n-1] into v[0]
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} // runs `execution` n times in sequence; the loop index is visible to it as `ie`
+#define REDUCE_MAX(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } // sequential scan: leaves in v[0] an element of v[0..n-1] with the largest .y
+#define REDUCE_MIN(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; } // sequential scan: leaves in v[0] an element of v[0..n-1] with the smallest .y
+
+// Reduces a set of contact points (xyz = position, w = depth) to at most four.
+//
+// Return value and contactIdx contract:
+//  - nPoints == 0      : returns 0, contactIdx untouched.
+//  - nPoints <= 4      : returns nPoints, contactIdx untouched (the caller keeps all points).
+//  - otherwise         : only the first 64 points are considered; returns 4 and
+//                        stores four point indices in contactIdx[0].x/.y/.z/.w.
+//
+// Selection: two axes u and v are built from nearNormal and the offset of p[0]
+// from the centroid. For each of u, -u, v, -v the point with the smallest
+// projection is recorded. If the point with the smallest w (deepest
+// penetration) is not among them, it replaces the first slot.
+int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)
+{
+	if( nPoints == 0 )
+        return 0;
+    
+    if (nPoints <=4)
+        return nPoints;
+    
+    
+    if (nPoints >64)
+        nPoints = 64;
+    
+	float4 center = make_float4(0.f);
+	{
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints;
+	}
+    
+	//	sample 4 directions
+    const float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector );
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );
+    v = normalize3( v );
+    
+    // Start from valid indices so every slot is defined even if the axes are
+    // degenerate (NaN projections never win a comparison below).
+    contactIdx[0] = make_int4(0,0,0,0);
+    
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    int minIndex=-1;
+    
+    // Running minima of the four projections. They must start at +FLT_MAX:
+    // FLT_MIN is the smallest positive float, which would reject every
+    // positive projection and could leave a slot unwritten.
+    float4 minDots = make_float4(FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX);
+    
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        
+        const float4 r = p[ie]-center;
+        float f;
+        
+        f = dot3F4( u, r );
+        if (f<minDots.x)
+        {
+            minDots.x = f;
+            contactIdx[0].x = ie;
+        }
+        
+        f = dot3F4( -u, r );
+        if (f<minDots.y)
+        {
+            minDots.y = f;
+            contactIdx[0].y = ie;
+        }
+        
+        f = dot3F4( v, r );
+        if (f<minDots.z)
+        {
+            minDots.z = f;
+            contactIdx[0].z = ie;
+        }
+        
+        f = dot3F4( -v, r );
+        if (f<minDots.w)
+        {
+            minDots.w = f;
+            contactIdx[0].w = ie;
+        }
+    }
+    
+    // minIndex stays -1 only if no depth compared below FLT_MAX; never store that.
+    if (minIndex >= 0 &&
+        contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    
+    return 4;
+}
+
+
+// Placeholder reduction: always selects the first four points (indices 0..3)
+// and returns min(nPoints, 4), or 0 when nPoints is 0. p and nearNormal are not read.
+int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)
+{
+    for (int slot = 0; slot < 4; slot++)
+    {
+        contactIdx[slot] = slot;
+    }
+    
+    return (nPoints == 0) ? 0 : min2( nPoints, 4 );
+}
+
+
+
+// Reduces a set of contact points to at most four and writes their indices to contactIdx.
+//
+//  - nPoints == 0 : returns 0, contactIdx untouched.
+//  - nPoints  < 4 : contactIdx[i] = i for every point, returns nPoints.
+//  - otherwise    : only the first 64 points are considered; returns 4.
+//
+// Selection: two axes u and v are built from nearNormal and the offset of p[0]
+// from the centroid. For each of u, -u, v, -v the projection of every point is
+// packed into an integer key (float bit pattern with its low 8 bits replaced by
+// the point index); the largest key per direction, compared as a signed
+// integer, wins and its low 8 bits give the selected index.
+//
+// Compared with the previous revision this drops a deepest-point pass whose
+// result was never used, and two 64-element scratch arrays; the selected
+// indices are unchanged.
+int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)
+{
+	if( nPoints == 0 ) return 0;
+
+	nPoints = min2( nPoints, 64 );
+
+	// Too few points to reduce: keep them all. The centroid is not needed here.
+	if( nPoints < 4 )
+	{
+		for(int i=0; i<nPoints; i++) 
+			contactIdx[i] = i;
+		return nPoints;
+	}
+
+	// Centroid, summed in index order.
+	float4 sum = p[0];
+	for (int j=1;j<nPoints;j++)
+		sum += p[j];
+	const float4 center = sum/(float)nPoints;
+
+	//	sample 4 directions
+	const float4 aVector = p[0] - center;
+	float4 u = cross3( nearNormal, aVector );
+	float4 v = cross3( nearNormal, u );
+	u = normalize3( u );
+	v = normalize3( v );
+
+	// Running per-direction maximum of the packed keys.
+	int4 best = make_int4(INT_MIN, INT_MIN, INT_MIN, INT_MIN);
+	for(int ie = 0; ie<nPoints; ie++ )
+	{
+		const float4 r = p[ie]-center;
+
+		// as_uint reinterprets the float bits; the index goes in the low byte.
+		int4 key;
+		key.x = (as_uint(dot3F4(  u, r )) & 0xffffff00) | (0xff & ie);
+		key.y = (as_uint(dot3F4( -u, r )) & 0xffffff00) | (0xff & ie);
+		key.z = (as_uint(dot3F4(  v, r )) & 0xffffff00) | (0xff & ie);
+		key.w = (as_uint(dot3F4( -v, r )) & 0xffffff00) | (0xff & ie);
+
+		best = max(best, key);
+	}
+
+	contactIdx[0] = best.x & 0xff;
+	contactIdx[1] = best.y & 0xff;
+	contactIdx[2] = best.z & 0xff;
+	contactIdx[3] = best.w & 0xff;
+
+	return 4;
+}
+
+
+
+// Kernel: one work-item per entry idx < numPairs.
+//
+// Copies the candidate contact points of entry idx (contactCounts[idx] points
+// starting at closestPointsWorld[contactOffsets[idx]]) into private memory,
+// reduces them with extractManifoldSequential, reserves one output slot through
+// the nContactsOut counter and, if that slot is below contactCapacity, fills
+// one b3Contact4Data record.
+//
+// The body indices are taken from pairs[pairIndex] (a kernel argument), not
+// from pairs[idx].
+__kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs, 
+																	__global const b3RigidBodyData_t* rigidBodies, 
+																	__global const float4* closestPointsWorld,
+																	__global const float4* separatingNormalsWorld,
+																	__global const int* contactCounts,
+																	__global const int* contactOffsets,
+																	__global struct b3Contact4Data* restrict contactsOut,
+																	counter32_t nContactsOut,
+																	int contactCapacity,
+																	int numPairs,
+																	int pairIndex
+																	)
+{
+	int idx = get_global_id(0);
+	
+	if (idx<numPairs)
+	{
+		float4 normal = separatingNormalsWorld[idx];
+		// localPoints holds 64 entries and extractManifoldSequential never looks
+		// past the first 64 points, so clamp the count before copying to avoid
+		// writing beyond the array.
+		int nPoints = min2(contactCounts[idx], 64);
+		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];
+		float4 localPoints[64];
+		for (int i=0;i<nPoints;i++)
+		{
+			localPoints[i] = pointsIn[i];
+		}
+
+		int contactIdx[4];// = {-1,-1,-1,-1};
+		contactIdx[0] = -1;
+		contactIdx[1] = -1;
+		contactIdx[2] = -1;
+		contactIdx[3] = -1;
+
+		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);
+
+		// Reserve an output slot; it is only used when it fits the buffer.
+		int dstIdx;
+		AppendInc( nContactsOut, dstIdx );
+		if (dstIdx<contactCapacity)
+		{
+			__global struct b3Contact4Data* c = contactsOut + dstIdx;
+			c->m_worldNormalOnB = -normal;
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+			c->m_batchIdx = idx;
+			int bodyA = pairs[pairIndex].x;
+			int bodyB = pairs[pairIndex].y;
+			// Bodies with zero inverse mass are stored with a negated index.
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
+			c->m_childIndexA = -1;
+			c->m_childIndexB = -1;
+			for (int i=0;i<nContacts;i++)
+			{
+				c->m_worldPosB[i] = localPoints[contactIdx[i]];
+			}
+			// The point count lives in m_worldNormalOnB.w, so it must be set
+			// after the normal has been written.
+			GET_NPOINTS(*c) = nContacts;
+		}
+	}
+}
+
+
+// Inverts a rigid transform given as translation + orientation:
+// the output orientation is qtInvert(orientationIn) and the output translation
+// is the negated input translation rotated by that inverted orientation.
+void	trInverse(float4 translationIn, Quaternion orientationIn,
+		float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion invOrientation = qtInvert(orientationIn);
+	*orientationOut = invOrientation;
+	*translationOut = qtRotate(invOrientation, -translationIn);
+}
+
+// Composes two rigid transforms (A applied after B): the output orientation is
+// orientationA * orientationB and the output translation is translationB
+// rotated by orientationA plus translationA.
+void	trMul(float4 translationA, Quaternion orientationA,
+						float4 translationB, Quaternion orientationB,
+		float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion combinedOrientation = qtMul(orientationA,orientationB);
+	*orientationOut = combinedOrientation;
+
+	const float4 rotatedB = qtRotate(orientationA, translationB);
+	*translationOut = rotatedB + translationA;
+}
+
+
+
+
+// Kernel: one work-item per convex-convex pair.
+//
+// For a pair flagged in hasSeparatingAxis, clips the two hulls against each
+// other along separatingNormals[pair], reduces the resulting points to at most
+// four and writes one b3Contact4Data record.
+//
+// pairs[pair].z selects the output record: a non-negative value is reused as the
+// destination index (and its point 0 is left untouched); a negative value makes
+// the kernel reserve a new slot through nGlobalContactsOut. The chosen index is
+// written back to pairs[pair].z when it is below contactCapacity.
+__kernel void   clipHullHullKernel( __global int4* pairs, 
+									__global const b3RigidBodyData_t* rigidBodies, 
+									__global const b3Collidable_t* collidables,
+									__global const b3ConvexPolyhedronData_t* convexShapes, 
+									__global const float4* vertices,
+									__global const float4* uniqueEdges,
+									__global const b3GpuFace_t* faces,
+									__global const int* indices,
+									__global const float4* separatingNormals,
+									__global const int* hasSeparatingAxis,
+									__global struct b3Contact4Data* restrict globalContactsOut,
+									counter32_t nGlobalContactsOut,
+									int numPairs,
+									int contactCapacity)
+{
+	const int pairIndex = get_global_id(0);
+
+	// Nothing to do for padding work-items or pairs without a separating axis.
+	if (pairIndex>=numPairs)
+		return;
+	if (!hasSeparatingAxis[pairIndex])
+		return;
+
+	const int bodyIndexA = pairs[pairIndex].x;
+	const int bodyIndexB = pairs[pairIndex].y;
+
+	const int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	const int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	const int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	const int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	// Private scratch buffers for clipping and for the unreduced contacts.
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	float4 localContactsOut[64];
+	const int capacityWorldVerts = 64;
+	const int localContactCapacity = 64;
+
+	const float minDist = -1e30f;
+	const float maxDist = 0.02f;
+
+	const int numLocalContactsOut = clipHullAgainstHull(separatingNormals[pairIndex],
+								&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+								rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+								rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+								worldVertsB1,worldVertsB2,capacityWorldVerts,
+								minDist, maxDist,
+								vertices,faces,indices,
+								localContactsOut,localContactCapacity);
+	if (numLocalContactsOut<=0)
+		return;
+
+	const float4 normal = -separatingNormals[pairIndex];
+
+	int contactIdx[4];
+	for (int k=0;k<4;k++)
+	{
+		contactIdx[k] = -1;
+	}
+
+	const int nReducedContacts = extractManifoldSequential(localContactsOut, numLocalContactsOut, normal, contactIdx);
+
+	// Reuse the record referenced by pairs[].z if there is one, else reserve a new slot.
+	const int mprContactIndex = pairs[pairIndex].z;
+	int dstIdx = mprContactIndex;
+	if (dstIdx<0)
+	{
+		AppendInc( nGlobalContactsOut, dstIdx );
+	}
+	if (dstIdx>=contactCapacity)
+		return;
+
+	pairs[pairIndex].z = dstIdx;
+
+	__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+	c->m_worldNormalOnB = -normal;
+	c->m_restituitionCoeffCmp = (0.f*0xffff);
+	c->m_frictionCoeffCmp = (0.7f*0xffff);
+	c->m_batchIdx = pairIndex;
+	// Bodies with zero inverse mass are stored with a negated index.
+	c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+	c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+	c->m_childIndexA = -1;
+	c->m_childIndexB = -1;
+
+	for (int k=0;k<nReducedContacts;k++)
+	{
+		// Overwrite the contact point, unless slot 0 already holds a valid 'mpr' contact.
+		if (k>0||(mprContactIndex<0))
+		{
+			c->m_worldPosB[k] = localContactsOut[contactIdx[k]];
+		}
+	}
+	// The point count lives in m_worldNormalOnB.w, so set it after the normal.
+	GET_NPOINTS(*c) = nReducedContacts;
+}
+
+
+// Kernel: one work-item per compound pair.
+//
+// gpuCompoundPairs[i] = (bodyA, bodyB, childShapeA, childShapeB). A child index
+// >= 0 selects an entry of gpuChildShapes whose local position/orientation is
+// composed with the body transform; a negative index means the body's own
+// collidable is used. For pairs flagged in gpuHasCompoundSepNormalsOut the two
+// hulls are clipped along gpuCompoundSepNormalsOut[i], the points are reduced to
+// at most four and one b3Contact4Data record is appended.
+//
+// The capacity test is `dstIdx < maxContactCapacity`, matching the other contact
+// kernels in this file: each reserved index holds one record that stores all
+// reduced points, so the number of points does not consume additional slots.
+__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, 
+																					__global const b3RigidBodyData_t* rigidBodies, 
+																					__global const b3Collidable_t* collidables,
+																					__global const b3ConvexPolyhedronData_t* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const b3GpuFace_t* faces,
+																					__global const int* indices,
+																					__global const b3GpuChildShape_t* gpuChildShapes,
+																					__global const float4* gpuCompoundSepNormalsOut,
+																					__global const int* gpuHasCompoundSepNormalsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numCompoundPairs, int maxContactCapacity)
+{
+
+	int i = get_global_id(0);
+	int pairIndex = i;
+	
+	// Private scratch buffers for clipping and for the unreduced contacts.
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+
+	if (i<numCompoundPairs)
+	{
+
+		if (gpuHasCompoundSepNormalsOut[i])
+		{
+
+			int bodyIndexA = gpuCompoundPairs[i].x;
+			int bodyIndexB = gpuCompoundPairs[i].y;
+			
+			int childShapeIndexA = gpuCompoundPairs[i].z;
+			int childShapeIndexB = gpuCompoundPairs[i].w;
+			
+			int collidableIndexA = -1;
+			int collidableIndexB = -1;
+			
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			
+			float4 ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+								
+			// Compose the child's local transform with the body transform (side A).
+			if (childShapeIndexA >= 0)
+			{
+				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+				float4 newPosA = qtRotate(ornA,childPosA)+posA;
+				float4 newOrnA = qtMul(ornA,childOrnA);
+				posA = newPosA;
+				ornA = newOrnA;
+			} else
+			{
+				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			}
+			
+			// Same for side B.
+			if (childShapeIndexB>=0)
+			{
+				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+				float4 newPosB = transform(&childPosB,&posB,&ornB);
+				float4 newOrnB = qtMul(ornB,childOrnB);
+				posB = newPosB;
+				ornB = newOrnB;
+			} else
+			{
+				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+			}
+			
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],
+														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+												
+			if (numLocalContactsOut>0)
+			{
+				float4 normal = -gpuCompoundSepNormalsOut[i];
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				int contactIdx[4];// = {-1,-1,-1,-1};
+
+				contactIdx[0] = -1;
+				contactIdx[1] = -1;
+				contactIdx[2] = -1;
+				contactIdx[3] = -1;
+		
+				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+		
+				// Reserve one output record; it is only used when it fits the buffer.
+				int dstIdx;
+				AppendInc( nGlobalContactsOut, dstIdx );
+				if (dstIdx < maxContactCapacity)
+				{
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+					c->m_worldNormalOnB = -normal;
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+					c->m_batchIdx = pairIndex;
+					int bodyA = gpuCompoundPairs[pairIndex].x;
+					int bodyB = gpuCompoundPairs[pairIndex].y;
+					// Bodies with zero inverse mass are stored with a negated index.
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_childIndexA = childShapeIndexA;
+					c->m_childIndexB = childShapeIndexB;
+					for (int k=0;k<nReducedContacts;k++)
+					{
+						c->m_worldPosB[k] = pointsIn[contactIdx[k]];
+					}
+					// The point count lives in m_worldNormalOnB.w, so set it after the normal.
+					GET_NPOINTS(*c) = nReducedContacts;
+				}
+				
+			}//		if (numLocalContactsOut>0)
+		}//		if (gpuHasCompoundSepNormalsOut[i])
+	}//	if (i<numCompoundPairs)
+
+}
+
+
+
+// Sphere-vs-sphere narrowphase, one work-item per entry of pairs.
+// If both collidables of the pair are SHAPE_SPHERE and the distance between the two body
+// positions is at most radiusA+radiusB, a contact slot is reserved with AppendInc on
+// nGlobalContactsOut; when that slot is below contactCapacity a one-point b3Contact4Data
+// is written to globalContactsOut. separatingNormals and hasSeparatingAxis are not read.
+__kernel void   sphereSphereCollisionKernel( __global const int4* pairs, 
+		__global const b3RigidBodyData_t* rigidBodies, 
+		__global const b3Collidable_t* collidables,
+		__global const float4* separatingNormals,
+		__global const int* hasSeparatingAxis,
+		__global struct b3Contact4Data* restrict globalContactsOut,
+		counter32_t nGlobalContactsOut,
+		int contactCapacity,
+		int numPairs)
+{
+	int pairIndex = get_global_id(0);
+	if (pairIndex >= numPairs)
+		return;
+
+	int bodyIndexA = pairs[pairIndex].x;
+	int bodyIndexB = pairs[pairIndex].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	//only sphere-sphere pairs are handled here
+	if (collidables[collidableIndexA].m_shapeType != SHAPE_SPHERE ||
+		collidables[collidableIndexB].m_shapeType != SHAPE_SPHERE)
+		return;
+
+	float radiusA = collidables[collidableIndexA].m_radius;
+	float radiusB = collidables[collidableIndexB].m_radius;
+	float radiusSum = radiusA+radiusB;
+	float4 posA = rigidBodies[bodyIndexA].m_pos;
+	float4 posB = rigidBodies[bodyIndexB].m_pos;
+	float4 diff = posA-posB;
+	float len = length(diff);
+
+	///if the distance is positive, don't generate a new contact
+	if (!(len <= radiusSum))
+		return;
+
+	///distance (negative means penetration)
+	float dist = len - radiusSum;
+	//the default normal (+X) is kept when the centres are too close to normalize diff
+	float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);
+	if (len > 0.00001f)
+	{
+		normalOnSurfaceB = diff / len;
+	}
+	float4 contactPosB = posB + normalOnSurfaceB*radiusB;
+	contactPosB.w = dist;
+
+	int dstIdx;
+	AppendInc( nGlobalContactsOut, dstIdx );
+	if (dstIdx < contactCapacity)
+	{
+		__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+		c->m_worldNormalOnB = -normalOnSurfaceB;
+		c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+		c->m_batchIdx = pairIndex;
+		//a body whose m_invMass is 0 is stored with its index negated
+		c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+		c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+		c->m_worldPosB[0] = contactPosB;
+		c->m_childIndexA = -1;
+		c->m_childIndexB = -1;
+		GET_NPOINTS(*c) = 1;
+	}//if (dstIdx < contactCapacity)
+}
+
+// Contact generation for concave(triangle)-vs-convex pairs, one work-item per entry of concavePairsIn.
+// Face .z of shape A is turned into a temporary polyhedron (front face, back face, three edge planes),
+// clipped against shape B (child .w of B for SHAPE_COMPOUND_OF_CONVEX_HULLS) by clipHullAgainstHullLocalA,
+// reduced with extractManifoldSequential and appended to globalContactsOut if the slot is < contactCapacity.
+// Pairs with .w < 0 are skipped. uniqueEdges is not read by this kernel.
+__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,
+		__global const b3RigidBodyData_t* rigidBodies, 
+		__global const b3Collidable_t* collidables,
+		__global const b3ConvexPolyhedronData_t* convexShapes, 
+		__global const float4* vertices,
+		__global const float4* uniqueEdges,
+		__global const b3GpuFace_t* faces,
+		__global const int* indices,
+		__global const b3GpuChildShape_t* gpuChildShapes,
+		__global const float4* separatingNormals,
+		__global struct b3Contact4Data* restrict globalContactsOut,
+		counter32_t nGlobalContactsOut,
+		int contactCapacity,
+		int numConcavePairs)
+{
+	int i = get_global_id(0);
+	int pairIndex = i;
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+	if (i<numConcavePairs)
+	{
+		//negative value means that the pair is invalid
+		if (concavePairsIn[i].w<0)
+			return;
+
+		int bodyIndexA = concavePairsIn[i].x;
+		int bodyIndexB = concavePairsIn[i].y;
+		int f = concavePairsIn[i].z;
+		int childShapeIndexA = f;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+		///////////////////////////////////////////////////////////////
+
+		//build a temporary convex polyhedron for face f of shape A:
+		//vertices in verticesA, faces in facesA, per-face vertex indices in indicesA
+
+		b3ConvexPolyhedronData_t convexPolyhedronA;
+
+		//add 3 vertices of the triangle
+		convexPolyhedronA.m_numVertices = 3;
+		convexPolyhedronA.m_vertexOffset = 0;
+		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+
+		b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+
+		float4 verticesA[3];
+		for (int v=0;v<3;v++)
+		{
+			int index = indices[face.m_indexOffset+v];
+			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+			verticesA[v] = vert;
+			localCenter += vert;
+		}
+
+		//localCenter now holds the sum of the three vertices; it is scaled by 1/3
+		//when it is stored in m_localCenter below
+
+		//note: uniqueEdgesA is filled here but is not passed to clipHullAgainstHullLocalA
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+
+
+		convexPolyhedronA.m_faceOffset = 0;
+
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+
+		b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+
+		//front side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot3F4(normal,verticesA[0]);
+			//plane constant for the flipped normal, computed from the first vertex
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int e=0;e<numVertices;e++)
+			{
+				float4 v0 = verticesA[e];
+				float4 v1 = verticesA[prevVertex];
+
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot3F4(edgeNormal,v0);
+
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=e;
+				indicesA[curUsedIndices++]=prevVertex;
+
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = e;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+
+
+		float4 sepAxis = separatingNormals[i];
+
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+		int childShapeIndexB =-1;
+		if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			///////////////////
+			///compound shape support
+			//the pair's .w selects the child shape; the child transform is composed with body B's
+			childShapeIndexB = concavePairsIn[pairIndex].w;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+
+		}
+
+		////////////////////////////////////////
+		//the triangle's vertices, faces and indices come from the private arrays built above,
+		//shape B's from the global vertices/faces/indices buffers
+
+		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,
+														&convexPolyhedronA, &convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														&verticesA,&facesA,&indicesA,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+
+		if (numLocalContactsOut>0)
+		{
+			float4 contactNormal = -separatingNormals[i];
+			int nPoints = numLocalContactsOut;
+			float4* pointsIn = localContactsOut;
+			int contactIdx[4];// = {-1,-1,-1,-1};
+
+			contactIdx[0] = -1;
+			contactIdx[1] = -1;
+			contactIdx[2] = -1;
+			contactIdx[3] = -1;
+
+			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, contactNormal, contactIdx);
+
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );
+			if (dstIdx<contactCapacity)
+			{
+				__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
+				c->m_worldNormalOnB = -contactNormal;
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+				c->m_batchIdx = pairIndex;
+				int bodyA = concavePairsIn[pairIndex].x;
+				int bodyB = concavePairsIn[pairIndex].y;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+				c->m_childIndexA = childShapeIndexA;
+				c->m_childIndexB = childShapeIndexB;
+				for (int p=0;p<nReducedContacts;p++)
+				{
+					c->m_worldPosB[p] = pointsIn[contactIdx[p]];
+				}
+				GET_NPOINTS(*c) = nReducedContacts;
+			}
+
+		}//		if (numLocalContactsOut>0)
+	}//	if (i<numConcavePairs)
+}
+
+
+
+
+
+
+// Picks the two faces used for clipping one pair and stores their world-space vertices.
+// hullB: face whose rotated normal has the largest dot product with separatingNormal; its
+// vertices go to worldVertsB1. hullA: face with the smallest dot product; its normal goes to
+// worldNormalsA1[pairIndex], its vertices to worldVertsA1. Vertex slots start at
+// pairIndex*capacityWorldVerts and at most capacityWorldVerts vertices are written per face.
+// clippingFaces[pairIndex] = (faceA, faceB, vertex count A, vertex count B); a hull without
+// faces gives face index -1 and count 0. minDist/maxDist are unused. Always returns 0.
+int	findClippingFaces(const float4 separatingNormal,
+                      __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1,
+                      __global float4* worldNormalsA1,
+                      __global float4* worldVertsB1,
+                      int capacityWorldVerts,
+                      const float minDist, float maxDist,
+                      __global const float4* vertices,
+                      __global const b3GpuFace_t* faces,
+                      __global const int* indices,
+                      __global int4* clippingFaces, int pairIndex)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+	int numVerticesA = 0;
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+	for(int face=0;face<hullB->m_numFaces;face++)
+	{
+		const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,
+				faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+		const float4 WorldNormal = qtRotate(ornB, Normal);
+		float d = dot3F4(WorldNormal,separatingNormal);
+		if (d > dmax)
+		{
+			dmax = d;
+			closestFaceB = face;
+		}
+	}
+
+	//closestFaceB stays -1 when hullB has no faces; faces[m_faceOffset-1] must not be read then
+	if (closestFaceB>=0)
+	{
+		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];
+		const int numVertices = (polyB.m_numIndices<capacityWorldVerts) ? polyB.m_numIndices : capacityWorldVerts;	//stay inside this pair's slots
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	int closestFaceA=-1;
+	float dmin = FLT_MAX;
+	for(int face=0;face<hullA->m_numFaces;face++)
+	{
+		const float4 Normal = make_float4(faces[hullA->m_faceOffset+face].m_plane.x,
+				faces[hullA->m_faceOffset+face].m_plane.y, faces[hullA->m_faceOffset+face].m_plane.z,0.f);
+		const float4 faceANormalWS = qtRotate(ornA,Normal);
+		float d = dot3F4(faceANormalWS,separatingNormal);
+		if (d < dmin)
+		{
+			dmin = d;
+			closestFaceA = face;
+			worldNormalsA1[pairIndex] = faceANormalWS;
+		}
+	}
+
+	if (closestFaceA>=0)
+	{
+		const b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];
+		numVerticesA = (polyA.m_numIndices<capacityWorldVerts) ? polyA.m_numIndices : capacityWorldVerts;	//same clamp as for B
+		for(int e0=0;e0<numVerticesA;e0++)
+		{
+			const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];
+			worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);
+		}
+	}
+	clippingFaces[pairIndex].x = closestFaceA;
+	clippingFaces[pairIndex].y = closestFaceB;
+	clippingFaces[pairIndex].z = numVerticesA;
+	clippingFaces[pairIndex].w = numWorldVertsB1;
+	return numContactsOut;
+}
+
+
+
+// Clips the stored face of hull B against the side planes of the stored face of hull A for
+// one pair and collects contact points.
+// Inputs are read at offset pairIndex*capacityWorldVertsB2 in worldVertsA1/worldVertsB1, the
+// face A normal from worldNormalsA1[pairIndex], and the face index and vertex counts from
+// clippingFaces[pairIndex] (.x face A, .z vertex count A, .w vertex count B).
+// One side plane is used per stored vertex of A (edge from that vertex to the next one).
+// Surviving vertices whose signed distance to the plane of face A is <= maxDist are kept as
+// (x,y,z,depth); depths <= minDist are replaced by minDist. They are written to whichever of
+// the pair's worldVertsB1/worldVertsB2 slots is the output buffer after the last swap.
+// The count is stored in clippingFaces[pairIndex].w and returned. If the stored face A index
+// is negative, 0 is returned and clippingFaces is left untouched.
+int clipFaces(__global float4* worldVertsA1,
+              __global float4* worldNormalsA1,
+              __global float4* worldVertsB1,
+              __global float4* worldVertsB2, 
+              int capacityWorldVertsB2,
+              const float minDist, float maxDist,
+              __global int4* clippingFaces,
+              int pairIndex)
+{
+	//face index and vertex counts stored for this pair
+	const int faceA = clippingFaces[pairIndex].x;
+	const int vertCountA = clippingFaces[pairIndex].z;
+	int vertCountB = clippingFaces[pairIndex].w;
+	int contactCount = 0;
+
+	//a negative face index means there is nothing to clip against
+	if (faceA<0)
+		return contactCount;
+
+	//the polygons of this pair start at the same offset in each of the
+	//per-pair vertex buffers
+	const int pairBase = pairIndex*capacityWorldVertsB2;
+	__global float4* src = &worldVertsB1[pairBase];
+	__global float4* dst = &worldVertsB2[pairBase];
+
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	for(int edge=0;edge<vertCountA;edge++)
+	{
+		//edge from stored vertex 'edge' to the next one, wrapping around at the end
+		const int nextVert = (edge+1)%vertCountA;
+		const float4 edgeStart = worldVertsA1[pairBase+edge];
+		const float4 edgeEnd = worldVertsA1[pairBase+nextVert];
+		const float4 edgeDir = edgeStart - edgeEnd;
+		const float4 faceNormalA = worldNormalsA1[pairIndex];
+
+		//side plane through the edge, built from the edge direction and the face normal
+		const float4 sidePlaneNormal = -cross3(edgeDir,faceNormalA);
+		const float sidePlaneEq = -dot3F4(edgeStart,sidePlaneNormal);
+
+		//clipFaceGlobal is given src as input and dst as output buffer
+		const int clippedCount = clipFaceGlobal(src, vertCountB, sidePlaneNormal,sidePlaneEq, dst);
+
+		//ping-pong: the output of this plane is the input of the next one
+		__global float4* swapTmp = dst;
+		dst = src;
+		src = swapTmp;
+		vertCountB = clippedCount;
+	}
+
+	//plane of face A: its normal plus the offset taken from the first stored vertex of A
+	const float4 facePlaneNormal = worldNormalsA1[pairIndex];
+	const float facePlaneEq = -dot3F4(facePlaneNormal,worldVertsA1[pairBase]);
+
+	//filter the clipped vertices by their signed distance to that plane
+	for (int k=0;k<vertCountB;k++)
+	{
+		const float4 pointInWorld = src[k];
+		float depth = dot3F4(facePlaneNormal,pointInWorld)+facePlaneEq;
+
+		//depths at or below minDist are reported as minDist
+		depth = (depth <=minDist) ? minDist : depth;
+
+		if (depth <=maxDist)
+		{
+			dst[contactCount++] = make_float4(pointInWorld.x,pointInWorld.y,
+					pointInWorld.z,depth);
+		}
+	}
+
+	//store the number of kept points for this pair
+	clippingFaces[pairIndex].w = contactCount;
+
+	//the same count is handed back to the caller
+	return contactCount;
+}
+
+
+
+
+// Kernel wrapper around findClippingFaces, one work-item per entry of pairs.
+// For each pair i < numPairs with a non-zero hasSeparatingAxis[i], the convex shapes of the
+// two bodies are looked up (body -> collidable -> shape index) and passed, together with the
+// body positions/orientations and separatingNormals[i], to findClippingFaces, which fills
+// clippingFacesOut, worldVertsA1, worldNormalsA1 and worldVertsB1 for pair i.
+// uniqueEdges is not read by this kernel.
+__kernel void   findClippingFacesKernel(  __global const int4* pairs,
+                                        __global const b3RigidBodyData_t* rigidBodies,
+                                        __global const b3Collidable_t* collidables,
+                                        __global const b3ConvexPolyhedronData_t* convexShapes,
+                                        __global const float4* vertices,
+                                        __global const float4* uniqueEdges,
+                                        __global const b3GpuFace_t* faces,
+                                        __global const int* indices,
+                                        __global const float4* separatingNormals,
+                                        __global const int* hasSeparatingAxis,
+                                        __global int4* clippingFacesOut,
+                                        __global float4* worldVertsA1,
+                                        __global float4* worldNormalsA1,
+                                        __global float4* worldVertsB1,
+                                        int capacityWorldVerts,
+                                        int numPairs
+                                        )
+{
+	const int pairIndex = get_global_id(0);
+
+	//work-items past the end of the pair list have nothing to do
+	if (pairIndex>=numPairs)
+		return;
+
+	//pairs whose hasSeparatingAxis entry is zero are skipped
+	if (!hasSeparatingAxis[pairIndex])
+		return;
+
+	const int bodyIndexA = pairs[pairIndex].x;
+	const int bodyIndexB = pairs[pairIndex].y;
+
+	const int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	const int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	const int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	const int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	//distance limits handed through to findClippingFaces
+	const float minDist = -1e30f;
+	const float maxDist = 0.02f;
+
+	//the value returned by findClippingFaces is not used
+	findClippingFaces(separatingNormals[pairIndex],
+			&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+			rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+			rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+			worldVertsA1,
+			worldNormalsA1,
+			worldVertsB1,capacityWorldVerts,
+			minDist, maxDist,
+			vertices,faces,indices,
+			clippingFacesOut,pairIndex);
+}
+
+
+
+
+// Clipping stage, one work-item per pair: consumes the per-pair data stored in
+// clippingFacesOut, worldVertsA1, worldNormalsA1 and worldVertsB1.
+// For each pair i < numPairs with a non-zero hasSeparatingAxis[i] whose stored face A index
+// (clippingFacesOut[i].x) is >= 0, the stored polygon of B is clipped with clipFaceGlobal against
+// one side plane per stored vertex of A, ping-ponging between the pair's slots in worldVertsB1
+// and worldVertsB2. Surviving vertices whose signed distance to the plane of face A is
+// <= maxDist are kept as (x,y,z,depth), with depths <= minDist replaced by minDist.
+// The number of kept points is stored in clippingFacesOut[i].w (0 if face A index is negative)
+// and the points are finally copied so that both slots start with them.
+// separatingNormals and debugMode are not read by this kernel.
+__kernel void   clipFacesAndFindContactsKernel(    __global const float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global int4* clippingFacesOut,
+                                                   __global float4* worldVertsA1,
+                                                   __global float4* worldNormalsA1,
+                                                   __global float4* worldVertsB1,
+                                                   __global float4* worldVertsB2,
+                                                    int vertexFaceCapacity,
+                                                   int numPairs,
+					                                        int debugMode
+                                                   )
+{
+	const int pairIndex = get_global_id(0);
+
+	//work-items past the end of the pair list have nothing to do
+	if (pairIndex>=numPairs)
+		return;
+
+	//pairs whose hasSeparatingAxis entry is zero are skipped
+	if (!hasSeparatingAxis[pairIndex])
+		return;
+
+	//distance limits applied to the clipped vertices below
+	const float minDist = -1e30f;
+	const float maxDist = 0.02f;
+
+	//the polygons of this pair start at the same offset in each per-pair vertex buffer
+	const int pairBase = pairIndex*vertexFaceCapacity;
+	__global float4* src = &worldVertsB1[pairBase];
+	__global float4* dst = &worldVertsB2[pairBase];
+
+	//face index and vertex counts stored for this pair
+	const int faceA = clippingFacesOut[pairIndex].x;
+	const int vertCountA = clippingFacesOut[pairIndex].z;
+	int vertCountB = clippingFacesOut[pairIndex].w;
+	int contactCount = 0;
+
+	if (faceA>=0)
+	{
+		// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+		for(int edge=0;edge<vertCountA;edge++)
+		{
+			//edge from stored vertex 'edge' to the next one, wrapping around at the end
+			const int nextVert = (edge+1)%vertCountA;
+			const float4 edgeStart = worldVertsA1[pairBase+edge];
+			const float4 edgeEnd = worldVertsA1[pairBase+nextVert];
+			const float4 edgeDir = edgeStart - edgeEnd;
+			const float4 faceNormalA = worldNormalsA1[pairIndex];
+
+			//side plane through the edge, built from the edge direction and the face normal
+			const float4 sidePlaneNormal = -cross3(edgeDir,faceNormalA);
+			const float sidePlaneEq = -dot3F4(edgeStart,sidePlaneNormal);
+
+			//clipFaceGlobal is given src as input and dst as output buffer
+			const int clippedCount = clipFaceGlobal(src, vertCountB, sidePlaneNormal,sidePlaneEq, dst);
+
+			//ping-pong: the output of this plane is the input of the next one
+			__global float4* swapTmp = dst;
+			dst = src;
+			src = swapTmp;
+			vertCountB = clippedCount;
+		}
+
+		//plane of face A: its normal plus the offset taken from the first stored vertex of A
+		const float4 facePlaneNormal = worldNormalsA1[pairIndex];
+		const float facePlaneEq = -dot3F4(facePlaneNormal,worldVertsA1[pairBase]);
+
+		//filter the clipped vertices by their signed distance to that plane
+		for (int k=0;k<vertCountB;k++)
+		{
+			const float4 pointInWorld = src[k];
+			float depth = dot3F4(facePlaneNormal,pointInWorld)+facePlaneEq;
+
+			//depths at or below minDist are reported as minDist
+			depth = (depth <=minDist) ? minDist : depth;
+
+			if (depth <=maxDist)
+			{
+				dst[contactCount++] = make_float4(pointInWorld.x,pointInWorld.y,
+						pointInWorld.z,depth);
+			}
+		}
+	}
+
+	//store the number of kept points (0 when the face A index was negative)
+	clippingFacesOut[pairIndex].w = contactCount;
+
+	//the kept points were written to dst; copy them to the pair's other slot as well so
+	//that both worldVertsB1 and worldVertsB2 start with them
+	for (int k=0;k<contactCount;k++)
+	{
+		src[k] = dst[k];
+	}
+}
+
+
+
+
+
+// Contact reduction stage, one work-item per entry of pairs.
+// For each pair i < numPairs with a non-zero hasSeparatingAxis[i] and clippingFaces[i].w > 0,
+// the clippingFaces[i].w points stored at worldVertsB2[i*vertexFaceCapacity] are reduced with
+// extractManifoldSequentialGlobal and written to one b3Contact4Data in globalContactsOut.
+// The destination slot is pairs[i].z when that value is >= 0, otherwise a slot reserved with
+// AppendInc on nGlobalContactsOut; nothing is written if the slot is >= contactCapacity.
+// When a contact is written, its slot is recorded in pairs[i].w. With a pre-assigned slot
+// (pairs[i].z >= 0) m_worldPosB[0] of that contact is not written by this kernel.
+// rigidBodies is only read for m_invMass of the two bodies.
+__kernel void   newContactReductionKernel( __global int4* pairs,
+                                                   __global const b3RigidBodyData_t* rigidBodies,
+                                                   __global const float4* separatingNormals,
+                                                   __global const int* hasSeparatingAxis,
+                                                   __global struct b3Contact4Data* globalContactsOut,
+                                                   __global int4* clippingFaces,
+                                                   __global float4* worldVertsB2,
+                                                   volatile __global int* nGlobalContactsOut,
+                                                   int vertexFaceCapacity,
+												   int contactCapacity,
+                                                   int numPairs
+                                                   )
+{
+	const int pairIndex = get_global_id(0);
+
+	//work-items past the end of the pair list have nothing to do
+	if (pairIndex>=numPairs)
+		return;
+
+	//pairs whose hasSeparatingAxis entry is zero are skipped
+	if (!hasSeparatingAxis[pairIndex])
+		return;
+
+	//number of points stored for this pair
+	const int nPoints = clippingFaces[pairIndex].w;
+	if (nPoints<=0)
+		return;
+
+	//indices of the points selected by the reduction, pre-set to 0..3
+	int4 contactIdx = make_int4(0,1,2,3);
+
+	//the pair's candidate points and the negated separating normal
+	__global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];
+	float4 normal = -separatingNormals[pairIndex];
+	const int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
+
+	//a non-negative pairs[].z is used as the destination slot as-is
+	const int mprContactIndex = pairs[pairIndex].z;
+	int dstIdx = mprContactIndex;
+	if (dstIdx<0)
+	{
+		//no slot yet: reserve a new one
+		AppendInc( nGlobalContactsOut, dstIdx );
+	}
+
+	//slots at or beyond the capacity are dropped
+	if (dstIdx >= contactCapacity)
+		return;
+
+	//fill in the contact record
+	__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+	c->m_worldNormalOnB = -normal;
+	c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
+	c->m_batchIdx = pairIndex;
+
+	const int bodyA = pairs[pairIndex].x;
+	const int bodyB = pairs[pairIndex].y;
+
+	//remember which contact slot belongs to this pair
+	pairs[pairIndex].w = dstIdx;
+
+	//a body whose m_invMass is 0 is stored with its index negated
+	c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+	c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+	c->m_childIndexA =-1;
+	c->m_childIndexB =-1;
+
+	//copy the selected points, highest index first; counts outside 1..4 copy nothing
+	//(same effect as a switch on the count that falls through from case 4 down to case 1)
+	if (nReducedContacts>=1 && nReducedContacts<=4)
+	{
+		if (nReducedContacts==4)
+			c->m_worldPosB[3] = pointsIn[contactIdx.w];
+		if (nReducedContacts>=3)
+			c->m_worldPosB[2] = pointsIn[contactIdx.z];
+		if (nReducedContacts>=2)
+			c->m_worldPosB[1] = pointsIn[contactIdx.y];
+
+		//point 0 is only written when the slot was newly reserved
+		if (mprContactIndex<0)//test
+			c->m_worldPosB[0] = pointsIn[contactIdx.x];
+	}
+
+	//the point count is stored even when it is outside 1..4
+	GET_NPOINTS(*c) = nReducedContacts;
+}
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
new file mode 100644
index 0000000000..f0ecfc7851
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
@@ -0,0 +1,2099 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satClipKernelsCL= \
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"typedef unsigned int u32;\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"{\n"
+"	// True when |x|, |y| and |z| are all <= 1e-6 (w is not inspected); the f suffix keeps the compare in float.\n"
+"	const float eps = 1e-6f;\n"
+"	return !(b3Fabs(v.x)>eps || b3Fabs(v.y)>eps || b3Fabs(v.z)>eps);\n"
+"}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPosB[4];\n"
+"//	b3Float4	m_localPosA[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormalOnB;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormalOnB.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
+"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#define B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"{\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"{\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"}\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"}\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"typedef struct b3GpuFace b3GpuFace_t;\n"
+"struct b3GpuFace\n"
+"{\n"
+"	b3Float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"	int m_unusedPadding1;\n"
+"	int m_unusedPadding2;\n"
+"};\n"
+"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n"
+"struct b3ConvexPolyhedronData\n"
+"{\n"
+"	b3Float4		m_localCenter;\n"
+"	b3Float4		m_extents;\n"
+"	b3Float4		mC;\n"
+"	b3Float4		mE;\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"};\n"
+"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n"
+"#ifndef B3_COLLIDABLE_H\n"
+"#define B3_COLLIDABLE_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"enum b3ShapeTypes\n"
+"{\n"
+"	SHAPE_HEIGHT_FIELD=1,\n"
+"	SHAPE_CONVEX_HULL=3,\n"
+"	SHAPE_PLANE=4,\n"
+"	SHAPE_CONCAVE_TRIMESH=5,\n"
+"	SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
+"	SHAPE_SPHERE=7,\n"
+"	MAX_NUM_SHAPE_TYPES,\n"
+"};\n"
+"typedef struct b3Collidable b3Collidable_t;\n"
+"struct b3Collidable\n"
+"{\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"};\n"
+"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
+"struct b3GpuChildShape\n"
+"{\n"
+"	b3Float4	m_childPosition;\n"
+"	b3Quat		m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"};\n"
+"struct b3CompoundOverlappingPair\n"
+"{\n"
+"	int m_bodyIndexA;\n"
+"	int m_bodyIndexB;\n"
+"//	int	m_pairType;\n"
+"	int m_childShapeIndexA;\n"
+"	int m_childShapeIndexB;\n"
+"};\n"
+"#endif //B3_COLLIDABLE_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"typedef struct\n"
+"{\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"}\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"// Returns a*b with a treated as a row vector: lane k is the 3-component dot of a with column k of b.\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	// Build all four lanes at once so w is a defined 0 (as mtMul1 does) and no lane is left uninitialized.\n"
+"	b3Float4 ans = b3MakeFloat4(b3Dot3F4( a, colx ), b3Dot3F4( a, coly ),\n"
+"		b3Dot3F4( a, colz ), 0.f);\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"{\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"};\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"{\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"};\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"__inline\n"
+"float fastDiv(float numerator, float denominator)\n"
+"{\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"}\n"
+"__inline\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"{\n"
+"	return native_divide(numerator, denominator);	\n"
+"}\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	return cross(a,b);\n"
+"}\n"
+"//#define dot3F4 dot\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"}\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"typedef float4 Quaternion;\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"}\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"}\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"}\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"}\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"}\n"
+"__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
+"{\n"
+"	return make_float4(	a.x + (b.x - a.x) * t,\n"
+"						a.y + (b.y - a.y) * t,\n"
+"						a.z + (b.z - a.z) * t,\n"
+"						0.f);\n"
+"}\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n"
+"{\n"
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n"
+"	int numVertsOut = 0;\n"
+"    //double-check next test\n"
+"    	if (numVertsIn < 2)\n"
+"    		return 0;\n"
+"    \n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n"
+"	float4 endVertex = pVtxIn[0];\n"
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"    \n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n"
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n"
+"		firstVertex = endVertex;\n"
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"}\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n"
+"{\n"
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n"
+"	int numVertsOut = 0;\n"
+"//double-check next test\n"
+"	if (numVertsIn < 2)\n"
+"		return 0;\n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n"
+"	float4 endVertex = pVtxIn[0];\n"
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n"
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n"
+"		firstVertex = endVertex;\n"
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"}\n"
+"// Clips the polygon in worldVertsB1 against the edge planes of the face of hull A whose rotated normal has the\n"
+"// smallest dot product with separatingNormal, then stores kept points as (x,y,z,depth). Returns the count stored.\n"
+"int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const b3GpuFace_t* faces,\n"
+"	__global const int* indices,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	// worldVertsB1/worldVertsB2 act as ping-pong buffers: every clip pass reads pVtxIn and writes pVtxOut\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				faces[hullA->m_faceOffset+face].m_plane.x, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.y, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"	b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		\n"
+"		//clip face\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		//swap the buffers so the polygon just clipped becomes the input of the next pass\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n"
+"		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n"
+"		for (int i=0;i<numVertsIn;i++)\n"
+"		{\n"
+"			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"			if (depth <=maxDist && numContactsOut < contactCapacity)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				//contactCapacity bounds contactsOut: points beyond it are dropped instead of written past the end\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	return numContactsOut;\n"
+"}\n"
+"// Variant of clipFaceAgainstHull that reads hull A (hullA, verticesA, facesA, indicesA) through non-__global\n"
+"// pointers. Stores kept points as (x,y,z,depth) in contactsOut and returns the count stored.\n"
+"int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const b3GpuFace_t* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const b3GpuFace_t* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	// worldVertsB1/worldVertsB2 act as ping-pong buffers: every clip pass reads pVtxIn and writes pVtxOut\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				facesA[hullA->m_faceOffset+face].m_plane.x, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.y, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"	b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		\n"
+"		//clip face\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		//swap the buffers so the polygon just clipped becomes the input of the next pass\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n"
+"		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n"
+"		for (int i=0;i<numVertsIn;i++)\n"
+"		{\n"
+"			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"			if (depth <=maxDist && numContactsOut < contactCapacity)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				//contactCapacity bounds contactsOut: points beyond it are dropped instead of written past the end\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	return numContactsOut;\n"
+"}\n"
+"// Selects the face of hull B whose rotated normal has the largest dot product with separatingNormal, writes its\n"
+"// vertices in world space to worldVertsB1 and clips them against hull A. Returns the number of contacts produced.\n"
+"int	clipHullAgainstHull(const float4 separatingNormal,\n"
+"	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const b3GpuFace_t* faces,\n"
+"	__global const int* indices,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"	for(int face=0;face<hullB->m_numFaces;face++)\n"
+"	{\n"
+"		const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n"
+"			faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"		const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"		float d = dot3F4(WorldNormal,separatingNormal);\n"
+"		if (d > dmax)\n"
+"		{\n"
+"			dmax = d;\n"
+"			closestFaceB = face;\n"
+"		}\n"
+"	}\n"
+"	// closestFaceB is still -1 when the loop selected no face; return before faces[] would be indexed with it\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"	{\n"
+"		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		const int numVertices = polyB.m_numIndices;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"	numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n"
+"			faces,\n"
+"			indices,localContactsOut,localContactCapacity);\n"
+"	return numContactsOut;\n"
+"}\n"
+"// Variant of clipHullAgainstHull that reads hull A through non-__global pointers: selects the face of hull B whose\n"
+"// rotated normal has the largest dot product with separatingNormal and clips it against hull A. Returns the contact count.\n"
+"int	clipHullAgainstHullLocalA(const float4 separatingNormal,\n"
+"	const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const b3GpuFace_t* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const b3GpuFace_t* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"	for(int face=0;face<hullB->m_numFaces;face++)\n"
+"	{\n"
+"		const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n"
+"			facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"		const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"		float d = dot3F4(WorldNormal,separatingNormal);\n"
+"		if (d > dmax)\n"
+"		{\n"
+"			dmax = d;\n"
+"			closestFaceB = face;\n"
+"		}\n"
+"	}\n"
+"	// closestFaceB is still -1 when the loop selected no face; return before facesB[] would be indexed with it\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"	{\n"
+"		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		const int numVertices = polyB.m_numIndices;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"	numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n"
+"			verticesA,facesA,indicesA,\n"
+"			verticesB,facesB,indicesB,\n"
+"			localContactsOut,localContactCapacity);\n"
+"	return numContactsOut;\n"
+"}\n"
+"#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n"
+"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n"
+"#define REDUCE_MAX(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n"
+"#define REDUCE_MIN(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }\n"
+"int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" // Contact reduction: selects up to 4 of the first 64 points of p and stores their indices in contactIdx[0].xyzw.
+"{\n" // Returns 0 for no points, nPoints (contactIdx left untouched) for 1..4 points, otherwise 4.
+"	if( nPoints == 0 )\n"
+"        return 0;\n"
+"    \n"
+"    if (nPoints <=4)\n" // nothing to reduce: contactIdx is not written on this path
+"        return nPoints;\n"
+"    \n"
+"    \n"
+"    if (nPoints >64)\n" // only the first 64 points are ever considered
+"        nPoints = 64;\n"
+"    \n"
+"	float4 center = make_float4(0.f);\n" // mean of the considered points
+"	{\n"
+"		\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			center += p[i];\n"
+"		center /= (float)nPoints;\n"
+"	}\n"
+"    \n"
+"	\n"
+"    \n"
+"	//	sample 4 directions\n"
+"    \n"
+"    float4 aVector = p[0] - center;\n" // u and v are two sampling axes derived from nearNormal and the first point (cross3/normalize3 are defined elsewhere)
+"    float4 u = cross3( nearNormal, aVector );\n"
+"    float4 v = cross3( nearNormal, u );\n"
+"    u = normalize3( u );\n"
+"    v = normalize3( v );\n"
+"    \n"
+"    \n"
+"    //keep point with deepest penetration\n"
+"    float minW= FLT_MAX;\n"
+"    \n"
+"    int minIndex=-1;\n"
+"    \n"
+"    float4 maxDots;\n" // despite the name, each lane holds the running MINIMUM projection (the loop below compares with '<')
+"    maxDots.x = FLT_MAX;\n" // FLT_MAX rather than FLT_MIN: FLT_MIN is the smallest positive float, so a lane whose projections are all positive would never be written
+"    maxDots.y = FLT_MAX;\n"
+"    maxDots.z = FLT_MAX;\n"
+"    maxDots.w = FLT_MAX;\n"
+"    \n"
+"    //	idx, distance\n"
+"    for(int ie = 0; ie<nPoints; ie++ )\n"
+"    {\n"
+"        if (p[ie].w<minW)\n" // track the point with the smallest w
+"        {\n"
+"            minW = p[ie].w;\n"
+"            minIndex=ie;\n"
+"        }\n"
+"        float f;\n"
+"        float4 r = p[ie]-center;\n"
+"        f = dot3F4( u, r );\n" // lane x: smallest projection on +u
+"        if (f<maxDots.x)\n"
+"        {\n"
+"            maxDots.x = f;\n"
+"            contactIdx[0].x = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -u, r );\n" // lane y: smallest projection on -u
+"        if (f<maxDots.y)\n"
+"        {\n"
+"            maxDots.y = f;\n"
+"            contactIdx[0].y = ie;\n"
+"        }\n"
+"        \n"
+"        \n"
+"        f = dot3F4( v, r );\n" // lane z: smallest projection on +v
+"        if (f<maxDots.z)\n"
+"        {\n"
+"            maxDots.z = f;\n"
+"            contactIdx[0].z = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -v, r );\n" // lane w: smallest projection on -v
+"        if (f<maxDots.w)\n"
+"        {\n"
+"            maxDots.w = f;\n"
+"            contactIdx[0].w = ie;\n"
+"        }\n"
+"        \n"
+"    }\n"
+"    \n"
+"    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" // make sure the smallest-w point is part of the result
+"    {\n"
+"        //replace the first contact with minimum (todo: replace contact with least penetration)\n"
+"        contactIdx[0].x = minIndex;\n"
+"    }\n"
+"    \n"
+"    return 4;\n"
+"    \n"
+"}\n"
+"int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" // Placeholder reduction: p and nearNormal are never read; the first four indices are always selected.
+"{\n"
+"    contactIdx[0] = 0;\n" // all four slots are written unconditionally, even when fewer than four points exist
+"    contactIdx[1] = 1;\n"
+"    contactIdx[2] = 2;\n"
+"    contactIdx[3] = 3;\n"
+"    \n"
+"	if( nPoints == 0 ) return 0;\n"
+"    \n"
+"	nPoints = min2( nPoints, 4 );\n" // result is the point count capped at 4 (min2 is defined elsewhere; presumably the smaller of its two arguments)
+"    return nPoints;\n"
+"    \n"
+"}\n"
+"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" // Contact reduction over at most 64 points: writes up to 4 selected point indices to contactIdx[0..3].
+"{\n" // Returns 0 for no points, nPoints (identity indices) when nPoints < 4, otherwise 4.
+"	if( nPoints == 0 ) return 0;\n"
+"	nPoints = min2( nPoints, 64 );\n" // the local scratch arrays below hold 64 entries
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		float4 v[64];\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			v[i] = p[i];\n"
+"		//memcpy( v, p, nPoints*sizeof(float4) );\n"
+"		PARALLEL_SUM( v, nPoints );\n" // macro defined elsewhere; the next line relies on it leaving the sum of the points in v[0]
+"		center = v[0]/(float)nPoints;\n"
+"	}\n"
+"	\n"
+"	{	//	sample 4 directions\n"
+"		if( nPoints < 4 )\n"
+"		{\n"
+"			for(int i=0; i<nPoints; i++) \n"
+"				contactIdx[i] = i;\n"
+"			return nPoints;\n"
+"		}\n"
+"		float4 aVector = p[0] - center;\n" // u and v are the two sampling axes, built from nearNormal and the first point
+"		float4 u = cross3( nearNormal, aVector );\n"
+"		float4 v = cross3( nearNormal, u );\n"
+"		u = normalize3( u );\n"
+"		v = normalize3( v );\n"
+"		int idx[4];\n"
+"		float2 max00 = make_float2(0,FLT_MAX);\n"
+"		{\n"
+"			//	idx, distance\n"
+"			{\n"
+"				{\n"
+"					int4 a[64];\n"
+"					for(int ie = 0; ie<nPoints; ie++ )\n"
+"					{\n"
+"						\n"
+"						\n"
+"						float f;\n"
+"						float4 r = p[ie]-center;\n"
+"						f = dot3F4( u, r );\n"
+"						a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" // key = upper 24 bits of the float's bit pattern with the point index packed into the low 8 bits
+"						f = dot3F4( -u, r );\n"
+"						a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"						f = dot3F4( v, r );\n"
+"						a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"						f = dot3F4( -v, r );\n"
+"						a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"					}\n"
+"					for(int ie=0; ie<nPoints; ie++)\n" // per-lane integer maximum of the packed keys, accumulated into a[0]
+"					{\n"
+"						a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;\n"
+"						a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;\n"
+"						a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;\n"
+"						a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;\n"
+"					}\n"
+"					idx[0] = (int)a[0].x & 0xff;\n" // the low byte of each winning key is the index of the selected point
+"					idx[1] = (int)a[0].y & 0xff;\n"
+"					idx[2] = (int)a[0].z & 0xff;\n"
+"					idx[3] = (int)a[0].w & 0xff;\n"
+"				}\n"
+"			}\n"
+"			{\n"
+"				float2 h[64];\n" // (index, w) pairs; REDUCE_MIN leaves the smallest-w pair in h[0]
+"				PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );\n"
+"				REDUCE_MIN( h, nPoints );\n"
+"				max00 = h[0];\n" // NOTE(review): max00 is assigned here but never read afterwards in this function
+"			}\n"
+"		}\n"
+"		contactIdx[0] = idx[0];\n"
+"		contactIdx[1] = idx[1];\n"
+"		contactIdx[2] = idx[2];\n"
+"		contactIdx[3] = idx[3];\n"
+"		return 4;\n"
+"	}\n"
+"}\n"
+"__kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs, \n" // One work-item per entry (idx < numPairs): copies that entry's contact points, reduces them to at most 4 and appends one b3Contact4Data to contactsOut.
+"																	__global const b3RigidBodyData_t* rigidBodies, \n"
+"																	__global const float4* closestPointsWorld,\n"
+"																	__global const float4* separatingNormalsWorld,\n"
+"																	__global const int* contactCounts,\n"
+"																	__global const int* contactOffsets,\n"
+"																	__global struct b3Contact4Data* restrict contactsOut,\n"
+"																	counter32_t nContactsOut,\n"
+"																	int contactCapacity,\n"
+"																	int numPairs,\n"
+"																	int pairIndex\n"
+"																	)\n"
+"{\n"
+"	int idx = get_global_id(0);\n"
+"	\n"
+"	if (idx<numPairs)\n"
+"	{\n"
+"		float4 normal = separatingNormalsWorld[idx];\n"
+"		int nPoints = min2(contactCounts[idx], 64);\n" // clamp to the size of localPoints: an unclamped count above 64 would write past the end of the array in the copy loop below
+"		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n"
+"		float4 localPoints[64];\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"		{\n"
+"			localPoints[i] = pointsIn[i];\n"
+"		}\n"
+"		int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"		contactIdx[0] = -1;\n"
+"		contactIdx[1] = -1;\n"
+"		contactIdx[2] = -1;\n"
+"		contactIdx[3] = -1;\n"
+"		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n"
+"		int dstIdx;\n"
+"		AppendInc( nContactsOut, dstIdx );\n" // AppendInc is defined elsewhere; presumably reserves the next output slot and returns its index in dstIdx
+"		if (dstIdx<contactCapacity)\n"
+"		{\n"
+"			__global struct b3Contact4Data* c = contactsOut + dstIdx;\n"
+"			c->m_worldNormalOnB = -normal;\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"			c->m_batchIdx = idx;\n"
+"			int bodyA = pairs[pairIndex].x;\n" // NOTE(review): bodies come from pairs[pairIndex] (the kernel argument), not pairs[idx] -- confirm this is intended
+"			int bodyB = pairs[pairIndex].y;\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" // a body with zero inverse mass is stored with a negated index
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n"
+"			c->m_childIndexA = -1;\n"
+"			c->m_childIndexB = -1;\n"
+"			for (int i=0;i<nContacts;i++)\n"
+"			{\n"
+"				c->m_worldPosB[i] = localPoints[contactIdx[i]];\n"
+"			}\n"
+"			GET_NPOINTS(*c) = nContacts;\n"
+"		}\n"
+"	}\n"
+"}\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n" // Writes the inverse of the transform (translationIn, orientationIn) to the two output pointers.
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	*orientationOut = qtInvert(orientationIn);\n" // qtInvert/qtRotate are defined elsewhere; presumably quaternion inverse and vector rotation
+"	*translationOut = qtRotate(*orientationOut, -translationIn);\n" // the negated translation is rotated by the already-inverted orientation
+"}\n"
+"void	trMul(float4 translationA, Quaternion orientationA,\n" // Composes transform A with transform B and writes the result to the two output pointers.
+"						float4 translationB, Quaternion orientationB,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	*orientationOut = qtMul(orientationA,orientationB);\n"
+"	*translationOut = transform(&translationB,&translationA,&orientationA);\n" // transform() is defined elsewhere; presumably rotates translationB by orientationA and adds translationA
+"}\n"
+"__kernel void   clipHullHullKernel( __global int4* pairs, \n" // One work-item per pair: clips hull A against hull B, reduces the result to at most 4 points and writes one b3Contact4Data.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global const int* hasSeparatingAxis,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs,\n"
+"																					int contactCapacity)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch vertex buffers handed to clipHullAgainstHull
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // distance window forwarded to clipHullAgainstHull
+"	float maxDist = 0.02f;\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		if (hasSeparatingAxis[i])\n" // pairs whose flag is zero are skipped entirely
+"		{\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				float4 normal = -separatingNormals[i];\n"
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				\n"
+"				int mprContactIndex = pairs[pairIndex].z;\n" // a non-negative .z names an existing contact slot to reuse
+"				int dstIdx = mprContactIndex;\n"
+"				if (dstIdx<0)\n"
+"				{\n"
+"					AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is defined elsewhere; presumably reserves a new slot and returns its index in dstIdx
+"				}\n"
+"				if (dstIdx<contactCapacity)\n"
+"				{\n"
+"					pairs[pairIndex].z = dstIdx;\n" // the slot index is stored back into the pair
+"					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormalOnB = -normal;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body with zero inverse mass is stored with a negated index
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = -1;\n"
+"					for (int i=0;i<nReducedContacts;i++)\n" // this i shadows the work-item index declared at the top of the kernel
+"					{\n"
+"					//this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact\n"
+"						if (i>0||(mprContactIndex<0))\n"
+"						{\n"
+"							c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"						}\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"}\n"
+"__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n" // One work-item per compound pair: resolves the (child) shapes and their world transforms, clips hull A against hull B and appends one b3Contact4Data.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const b3GpuChildShape_t* gpuChildShapes,\n"
+"																					__global const float4* gpuCompoundSepNormalsOut,\n"
+"																					__global const int* gpuHasCompoundSepNormalsOut,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numCompoundPairs, int maxContactCapacity)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch vertex buffers handed to clipHullAgainstHull
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // distance window forwarded to clipHullAgainstHull
+"	float maxDist = 0.02f;\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		if (gpuHasCompoundSepNormalsOut[i])\n" // pairs whose flag is zero are skipped entirely
+"		{\n"
+"			int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"			int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"			\n"
+"			int childShapeIndexA = gpuCompoundPairs[i].z;\n" // a negative child index means the body's own collidable is used
+"			int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"			\n"
+"			int collidableIndexA = -1;\n"
+"			int collidableIndexB = -1;\n"
+"			\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			\n"
+"			float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"								\n"
+"			if (childShapeIndexA >= 0)\n"
+"			{\n"
+"				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"				float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" // child transform composed with the body transform
+"				float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"				posA = newPosA;\n"
+"				ornA = newOrnA;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			}\n"
+"			\n"
+"			if (childShapeIndexB>=0)\n"
+"			{\n"
+"				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"				float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"				float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"				posB = newPosB;\n"
+"				ornB = newOrnB;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"			}\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														posA,ornA,\n"
+"													  posB,ornB,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				float4 normal = -gpuCompoundSepNormalsOut[i];\n"
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is defined elsewhere; presumably reserves a new slot and returns its index in dstIdx
+"				if (dstIdx < maxContactCapacity)\n" // one slot stores all reduced points, so only dstIdx itself has to fit (same test as the other contact kernels in this file)
+"				{\n"
+"					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormalOnB = -normal;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = gpuCompoundPairs[pairIndex].x;\n"
+"					int bodyB = gpuCompoundPairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body with zero inverse mass is stored with a negated index
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_childIndexA = childShapeIndexA;\n"
+"					c->m_childIndexB = childShapeIndexB;\n"
+"					for (int i=0;i<nReducedContacts;i++)\n" // this i shadows the work-item index declared at the top of the kernel
+"					{\n"
+"						c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (gpuHasCompoundSepNormalsOut[i])\n"
+"	}//	if (i<numCompoundPairs)\n"
+"}\n"
+"__kernel void   sphereSphereCollisionKernel( __global const int4* pairs, \n" // One work-item per pair: if both collidables are spheres and they touch or overlap, appends a single-point b3Contact4Data.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global const int* hasSeparatingAxis,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			//sphere-sphere\n"
+"			float radiusA = collidables[collidableIndexA].m_radius;\n"
+"			float radiusB = collidables[collidableIndexB].m_radius;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			float4 diff = posA-posB;\n"
+"			diff.w = 0.f;\n" // length() of a float4 includes .w, so it is cleared to keep the centre distance purely 3D
+"			float len = length(diff);\n"
+"			///iff distance positive, don't generate a new contact\n"
+"			if ( len <= (radiusA+radiusB))\n"
+"			{\n"
+"				///distance (negative means penetration)\n"
+"				float dist = len - (radiusA+radiusB);\n"
+"				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" // fallback direction used when the centres (almost) coincide
+"				if (len > 0.00001f)\n" // float literal: avoids a double-precision constant in an otherwise single-precision kernel
+"				{\n"
+"					normalOnSurfaceB = diff / len;\n"
+"				}\n"
+"				float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n"
+"				contactPosB.w = dist;\n" // the contact point carries the signed distance in .w
+"								\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is defined elsewhere; presumably reserves a new slot and returns its index in dstIdx
+"				if (dstIdx < contactCapacity)\n"
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normalOnSurfaceB;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body with zero inverse mass is stored with a negated index
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_worldPosB[0] = contactPosB;\n"
+"					c->m_childIndexA = -1;\n"
+"					c->m_childIndexB = -1;\n"
+"					GET_NPOINTS(*c) = 1;\n"
+"				}//if (dstIdx < numPairs)\n"
+"			}//if ( len <= (radiusA+radiusB))\n"
+"		}//SHAPE_SPHERE SHAPE_SPHERE\n"
+"	}//if (i<numPairs)\n"
+"}				\n"
+"__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n" // One work-item per concave pair: builds a temporary polyhedron from one triangle of shape A, clips it against hull B (or a compound child of B) and appends one b3Contact4Data.
+"																					__global const b3RigidBodyData_t* rigidBodies, \n"
+"																					__global const b3Collidable_t* collidables,\n"
+"																					__global const b3ConvexPolyhedronData_t* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const b3GpuFace_t* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const b3GpuChildShape_t* gpuChildShapes,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int contactCapacity,\n"
+"																					int numConcavePairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n" // scratch vertex buffers handed to clipHullAgainstHullLocalA
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n" // distance window forwarded to clipHullAgainstHullLocalA
+"	float maxDist = 0.02f;\n"
+"	if (i<numConcavePairs)\n"
+"	{\n"
+"		//negative value means that the pair is invalid\n"
+"		if (concavePairsIn[i].w<0)\n"
+"			return;\n"
+"		int bodyIndexA = concavePairsIn[i].x;\n"
+"		int bodyIndexB = concavePairsIn[i].y;\n"
+"		int f = concavePairsIn[i].z;\n" // face (triangle) index inside shape A; also reported as m_childIndexA below
+"		int childShapeIndexA = f;\n"
+"		\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		///////////////////////////////////////////////////////////////\n"
+"		\n"
+"	\n"
+"		bool overlap = false;\n" // NOTE(review): never read in this kernel
+"		\n"
+"		b3ConvexPolyhedronData_t convexPolyhedronA;\n" // temporary polyhedron describing the triangle; its offsets index the local arrays built below
+"	//add 3 vertices of the triangle\n"
+"		convexPolyhedronA.m_numVertices = 3;\n"
+"		convexPolyhedronA.m_vertexOffset = 0;\n"
+"		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"		b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		\n"
+"		float4 verticesA[3];\n"
+"		for (int i=0;i<3;i++)\n" // this i shadows the work-item index inside the loop only
+"		{\n"
+"			int index = indices[face.m_indexOffset+i];\n"
+"			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"			verticesA[i] = vert;\n"
+"			localCenter += vert;\n"
+"		}\n"
+"		float dmin = FLT_MAX;\n" // NOTE(review): dmin and localCC are never read in this kernel
+"		int localCC=0;\n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n" // NOTE(review): filled below but not passed to clipHullAgainstHullLocalA
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"                                  \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"                             \n"
+"		b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];\n" // TRIANGLE_NUM_CONVEX_FACES is defined elsewhere; five faces are written below (front, back, three edge planes)
+"		int indicesA[3+3+2+2+2];\n" // 3 indices each for front and back, 2 for each edge plane
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"		//front size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n" // same vertices in reverse order
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot3F4(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n" // NOTE(review): c1 is never read; the plane constant used is c
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n" // one two-index face per triangle edge (current vertex, previous vertex)
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                                            \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot3F4(edgeNormal,v0);\n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                                            \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" // mean of the three triangle vertices
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n" // w of both positions is cleared before use
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 sepAxis = separatingNormals[i];\n"
+"		\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"		int childShapeIndexB =-1;\n"
+"		if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			///////////////////\n"
+"			///compound shape support\n"
+"			\n"
+"			childShapeIndexB = concavePairsIn[pairIndex].w;\n" // for compound B the pair's .w selects the child; its transform is composed with the body transform
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			\n"
+"		}\n"
+"		\n"
+"		////////////////////////////////////////\n"
+"		\n"
+"		\n"
+"		\n"
+"		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n" // hull A's geometry comes from the local arrays, hull B's from the global vertex/face/index buffers
+"														&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"														posA,ornA,\n"
+"													  posB,ornB,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														&verticesA,&facesA,&indicesA,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"			float4 normal = -separatingNormals[i];\n" // shadows the triangle normal declared above
+"			int nPoints = numLocalContactsOut;\n"
+"			float4* pointsIn = localContactsOut;\n"
+"			int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"			contactIdx[0] = -1;\n"
+"			contactIdx[1] = -1;\n"
+"			contactIdx[2] = -1;\n"
+"			contactIdx[3] = -1;\n"
+"	\n"
+"			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"	\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n" // AppendInc is defined elsewhere; presumably reserves a new slot and returns its index in dstIdx
+"			if (dstIdx<contactCapacity)\n"
+"			{\n"
+"				__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n"
+"				c->m_worldNormalOnB = -normal;\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				int bodyA = concavePairsIn[pairIndex].x;\n"
+"				int bodyB = concavePairsIn[pairIndex].y;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" // a body with zero inverse mass is stored with a negated index
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"				c->m_childIndexA = childShapeIndexA;\n"
+"				c->m_childIndexB = childShapeIndexB;\n"
+"				for (int i=0;i<nReducedContacts;i++)\n"
+"				{\n"
+"					c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n"
+"				}\n"
+"				GET_NPOINTS(*c) = nReducedContacts;\n"
+"			}\n"
+"				\n"
+"		}//		if (numContactsOut>0)\n"
+"	}//	if (i<numPairs)\n"
+"}\n"
+"int	findClippingFaces(const float4 separatingNormal,\n" // Picks one face on each hull relative to separatingNormal, writes their world-space vertices (and A's face normal) into the per-pair slices of the output buffers and records the result in clippingFaces[pairIndex].
+"                      __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n"
+"                      __global float4* worldNormalsA1,\n"
+"                      __global float4* worldVertsB1,\n"
+"                      int capacityWorldVerts,\n"
+"                      const float minDist, float maxDist,\n" // minDist and maxDist are not read in this function
+"                      __global const float4* vertices,\n"
+"                      __global const b3GpuFace_t* faces,\n"
+"                      __global const int* indices,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"
+"{\n"
+"	int numContactsOut = 0;\n" // never modified, so the function always returns 0
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n" // face of B whose world-space normal has the largest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" // NOTE(review): closestFaceB is still -1 here if hullB has no faces
+"		const int numVertices = polyB.m_numIndices;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n" // NOTE(review): the vertex count is not checked against capacityWorldVerts
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n" // face of A whose world-space normal has the smallest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n" // the normal of the current best face is stored as soon as it is found
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" // NOTE(review): closestFaceA is still -1 here if hullA has no faces
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"        worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n" // result layout: x = face on A, y = face on B, z = vertex count of A's face, w = vertex count of B's face
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"}\n"
+"int clipFaces(__global float4* worldVertsA1,\n" // Clips the stored face of B for this pair against the side planes of the stored face of A, keeps the points within maxDist of A's face plane, and returns their count (also written to clippingFaces[pairIndex].w).
+"              __global float4* worldNormalsA1,\n"
+"              __global float4* worldVertsB1,\n"
+"              __global float4* worldVertsB2, \n"
+"              int capacityWorldVertsB2,\n" // used as the per-pair stride for worldVertsA1, worldVertsB1 and worldVertsB2 alike
+"              const float minDist, float maxDist,\n"
+"              __global int4* clippingFaces,\n"
+"              int pairIndex)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"    \n"
+"    int closestFaceA = clippingFaces[pairIndex].x;\n"
+"    int closestFaceB = clippingFaces[pairIndex].y;\n" // read but not used below
+"	int numVertsInA = clippingFaces[pairIndex].z;\n"
+"	int numVertsInB = clippingFaces[pairIndex].w;\n"
+"    \n"
+"	int numVertsOut = 0;\n"
+"    \n"
+"	if (closestFaceA<0)\n" // no face recorded for A: nothing to clip, clippingFaces is left unchanged
+"		return numContactsOut;\n"
+"    \n"
+"    __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" // input/output buffers are swapped after every clipping pass
+"    __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"    \n"
+"    \n"
+"	\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"    \n"
+"	for(int e0=0;e0<numVertsInA;e0++)\n"
+"	{\n"
+"		const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"		const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" // next vertex of A's face, wrapping around
+"		const float4 WorldEdge0 = aw - bw;\n"
+"		float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" // side plane built from the edge and A's face normal, passing through aw
+"		float4 worldA1 = aw;\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" // clipFaceGlobal is defined elsewhere; its return value is used as the clipped vertex count
+"		__global float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsInB = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"    \n"
+"    //float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"    //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"    \n"
+"    /*for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        pVtxOut[i] = pVtxIn[i];\n"
+"    }*/\n"
+"    \n"
+"    \n"
+"    \n"
+"    \n"
+"    //numVertsInB=0;\n"
+"	\n"
+"    float4 planeNormalWS = worldNormalsA1[pairIndex];\n" // plane of A's face: stored normal through the first stored vertex of A
+"    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"    for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"        if (depth <=minDist)\n" // depth is clamped from below to minDist
+"        {\n"
+"            depth = minDist;\n"
+"        }\n"
+"        \n"
+"        if (depth <=maxDist)\n" // points farther than maxDist from A's face plane are dropped
+"        {\n"
+"            float4 pointInWorld = pVtxIn[i];\n"
+"            pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" // NOTE(review): pVtxOut is whichever B buffer is not the current input, so the output buffer depends on the parity of numVertsInA
+"        }\n"
+"    }\n"
+"   \n"
+"    clippingFaces[pairIndex].w =numContactsOut;\n" // .w now holds the contact count instead of B's vertex count
+"   \n"
+"    \n"
+"	return numContactsOut;\n"
+"}\n"
+"__kernel void   findClippingFacesKernel(  __global const int4* pairs,\n"
+"                                        __global const b3RigidBodyData_t* rigidBodies,\n"
+"                                        __global const b3Collidable_t* collidables,\n"
+"                                        __global const b3ConvexPolyhedronData_t* convexShapes,\n"
+"                                        __global const float4* vertices,\n"
+"                                        __global const float4* uniqueEdges,\n"
+"                                        __global const b3GpuFace_t* faces,\n"
+"                                        __global const int* indices,\n"
+"                                        __global const float4* separatingNormals,\n"
+"                                        __global const int* hasSeparatingAxis,\n"
+"                                        __global int4* clippingFacesOut,\n"
+"                                        __global float4* worldVertsA1,\n"
+"                                        __global float4* worldNormalsA1,\n"
+"                                        __global float4* worldVertsB1,\n"
+"                                        int capacityWorldVerts,\n"
+"                                        int numPairs\n"
+"                                        )\n"
+"{\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"    \n"
+"	\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"            \n"
+"            \n"
+"			int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n"
+"                                                        &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"                                                        rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"                                                        rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"                                                        worldVertsA1,\n"
+"                                                        worldNormalsA1,\n"
+"                                                        worldVertsB1,capacityWorldVerts,\n"
+"                                                        minDist, maxDist,\n"
+"                                                        vertices,faces,indices,\n"
+"                                                        clippingFacesOut,i);\n"
+"            \n"
+"            \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"}\n"
+"__kernel void   clipFacesAndFindContactsKernel(    __global const float4* separatingNormals,\n"
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                   __global int4* clippingFacesOut,\n"
+"                                                   __global float4* worldVertsA1,\n"
+"                                                   __global float4* worldNormalsA1,\n"
+"                                                   __global float4* worldVertsB1,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                    int vertexFaceCapacity,\n"
+"                                                   int numPairs,\n"
+"					                                        int debugMode\n"
+"                                                   )\n"
+"{\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    \n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"//			int bodyIndexA = pairs[i].x;\n"
+"	//		int bodyIndexB = pairs[i].y;\n"
+"		    \n"
+"            int numLocalContactsOut = 0;\n"
+"            int capacityWorldVertsB2 = vertexFaceCapacity;\n"
+"            \n"
+"            __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n"
+"            __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"            \n"
+"            {\n"
+"                __global int4* clippingFaces = clippingFacesOut;\n"
+"            \n"
+"                \n"
+"                int closestFaceA = clippingFaces[pairIndex].x;\n"
+"                int closestFaceB = clippingFaces[pairIndex].y;\n"
+"                int numVertsInA = clippingFaces[pairIndex].z;\n"
+"                int numVertsInB = clippingFaces[pairIndex].w;\n"
+"                \n"
+"                int numVertsOut = 0;\n"
+"                \n"
+"                if (closestFaceA>=0)\n"
+"                {\n"
+"                    \n"
+"                    \n"
+"                    \n"
+"                    // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"                    \n"
+"                    for(int e0=0;e0<numVertsInA;e0++)\n"
+"                    {\n"
+"                        const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"                        const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n"
+"                        const float4 WorldEdge0 = aw - bw;\n"
+"                        float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"                        float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"                        float4 worldA1 = aw;\n"
+"                        float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"                        float4 planeNormalWS = planeNormalWS1;\n"
+"                        float planeEqWS=planeEqWS1;\n"
+"                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n"
+"                        __global float4* tmp = pVtxOut;\n"
+"                        pVtxOut = pVtxIn;\n"
+"                        pVtxIn = tmp;\n"
+"                        numVertsInB = numVertsOut;\n"
+"                        numVertsOut = 0;\n"
+"                    }\n"
+"                    \n"
+"                    float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"                    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"                    \n"
+"                    for (int i=0;i<numVertsInB;i++)\n"
+"                    {\n"
+"                        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"                        if (depth <=minDist)\n"
+"                        {\n"
+"                            depth = minDist;\n"
+"                        }\n"
+"                        \n"
+"                        if (depth <=maxDist)\n"
+"                        {\n"
+"                            float4 pointInWorld = pVtxIn[i];\n"
+"                            pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"                        }\n"
+"                    }\n"
+"                    \n"
+"                }\n"
+"                clippingFaces[pairIndex].w =numLocalContactsOut;\n"
+"                \n"
+"            }\n"
+"            \n"
+"            for (int i=0;i<numLocalContactsOut;i++)\n"
+"                pVtxIn[i] = pVtxOut[i];\n"
+"                \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"}\n"
+"__kernel void   newContactReductionKernel( __global int4* pairs,\n"
+"                                                   __global const b3RigidBodyData_t* rigidBodies,\n"
+"                                                   __global const float4* separatingNormals,\n"
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                   __global struct b3Contact4Data* globalContactsOut,\n"
+"                                                   __global int4* clippingFaces,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                   volatile __global int* nGlobalContactsOut,\n"
+"                                                   int vertexFaceCapacity,\n"
+"												   int contactCapacity,\n"
+"                                                   int numPairs\n"
+"                                                   )\n"
+"{\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    int4 contactIdx;\n"
+"    contactIdx=make_int4(0,1,2,3);\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			\n"
+"            \n"
+"            \n"
+"			int nPoints = clippingFaces[pairIndex].w;\n"
+"           \n"
+"            if (nPoints>0)\n"
+"            {\n"
+"                 __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n"
+"                float4 normal = -separatingNormals[i];\n"
+"                \n"
+"                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n"
+"            \n"
+"				int mprContactIndex = pairs[pairIndex].z;\n"
+"                int dstIdx = mprContactIndex;\n"
+"				if (dstIdx<0)\n"
+"				{\n"
+"	                AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				}\n"
+"//#if 0\n"
+"                \n"
+"				if (dstIdx < contactCapacity)\n"
+"				{\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormalOnB = -normal;\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					pairs[pairIndex].w = dstIdx;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"                    c->m_childIndexA =-1;\n"
+"					c->m_childIndexB =-1;\n"
+"                    switch (nReducedContacts)\n"
+"                    {\n"
+"                        case 4:\n"
+"                            c->m_worldPosB[3] = pointsIn[contactIdx.w];\n"
+"                        case 3:\n"
+"                            c->m_worldPosB[2] = pointsIn[contactIdx.z];\n"
+"                        case 2:\n"
+"                            c->m_worldPosB[1] = pointsIn[contactIdx.y];\n"
+"                        case 1:\n"
+"							if (mprContactIndex<0)//test\n"
+"	                            c->m_worldPosB[0] = pointsIn[contactIdx.x];\n"
+"                        default:\n"
+"                        {\n"
+"                        }\n"
+"                    };\n"
+"                    \n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"                    \n"
+"                 }\n"
+"                 \n"
+"                \n"
+"//#endif\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"    \n"
+"}\n"
+;
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
new file mode 100644
index 0000000000..31ca43b8cd
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
@@ -0,0 +1,1220 @@
+
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+
+
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_CONCAVE_TRIMESH 5
+#define TRIANGLE_NUM_CONVEX_FACES 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+
+#define B3_MAX_STACK_DEPTH 256
+
+
+typedef unsigned int u32;
+
+///keep this in sync with btCollidable.h
+typedef struct	// GPU-side collidable record; its layout has to match the host struct (see the note above)
+{
+	union {	// first 4-byte slot: one int readable under two names
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	union
+	{	// second 4-byte slot: a float radius and an int index sharing the same storage
+		float m_radius;
+		int	m_compoundBvhIndex;
+	};
+	
+	int m_shapeType;	// compared against the SHAPE_* defines above
+	int m_shapeIndex;	// index into the convexShapes array handed to the kernels
+	
+} btCollidableGpu;
+
+#define MAX_NUM_PARTS_IN_BITS 10
+
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+{
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes: >=0 marks a leaf and holds its triangle index (see isLeafNode/getTriangleIndex);
+	int	m_escapeIndexOrTriangleIndex;	// <0 marks an internal node and holds the negated escape index (see getEscapeIndex)
+} b3QuantizedBvhNode;
+
+typedef struct	// same fields, in the same order, as the host-side b3BvhInfo (b3BvhInfo.h), with float4 in place of b3Vector3
+{
+	float4		m_aabbMin;
+	float4		m_aabbMax;
+	float4		m_quantization;	// presumably the scale used to quantize node AABBs - TODO confirm
+	int			m_numNodes;
+	int			m_numSubTrees;
+	int			m_nodeOffset;	// presumably the first node of this BVH in a shared node array - TODO confirm
+	int			m_subTreeOffset;	// presumably the first subtree of this BVH in a shared subtree array - TODO confirm
+
+} b3BvhInfo;
+
+
+int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)
+{
+	// The low (31-MAX_NUM_PARTS_IN_BITS) bits of the packed field hold the triangle index;
+	// build the complementary mask and clear everything above those bits.
+	const unsigned int upperBits = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
+	return (rootNode->m_escapeIndexOrTriangleIndex&~upperBits);
+}
+
+int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	// Same extraction as getTriangleIndex(), for a node referenced through a __global pointer:
+	// keep only the low (31-MAX_NUM_PARTS_IN_BITS) bits of the packed field.
+	const unsigned int upperBits = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
+	return (rootNode->m_escapeIndexOrTriangleIndex&~upperBits);
+}
+
+int isLeafNode(const b3QuantizedBvhNode* rootNode)
+{
+	// Returns 1 for a leaf (non-negative triangle index) and 0 for an internal node (negative skip index).
+	return !(rootNode->m_escapeIndexOrTriangleIndex < 0);
+}
+
+int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	// __global-pointer variant: 1 for a leaf (non-negative triangle index), 0 for an internal node.
+	return !(rootNode->m_escapeIndexOrTriangleIndex < 0);
+}
+	
+int getEscapeIndex(const b3QuantizedBvhNode* rootNode)
+{	// Internal nodes keep the escape index negated in the shared field; undo the negation.
+	return -(rootNode->m_escapeIndexOrTriangleIndex);
+}
+
+int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{	// __global-pointer variant: the escape index is stored negated, so flip the sign back.
+	return -(rootNode->m_escapeIndexOrTriangleIndex);
+}
+
+
+typedef struct	// one record per BVH subtree: quantized bounds plus the location and size of the subtree
+{
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	int			m_subtreeSize;
+	int			m_padding[3];	// 12 more bytes, so the members add up to 32 bytes
+} b3BvhSubtreeInfo;
+
+
+
+
+
+
+
+typedef struct	// one child entry of a compound shape
+{
+	float4	m_childPosition;	// presumably the child's position relative to its parent shape - TODO confirm
+	float4	m_childOrientation;	// presumably the child's orientation quaternion - TODO confirm
+	int m_shapeIndex;	// presumably an index into the convex shape array - TODO confirm
+	int m_unused0;	// m_unused0..2: spare ints; with them the members add up to 48 bytes
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct	// per-rigid-body record read by the kernels through the rigidBodies array
+{
+	float4 m_pos;	// presumably the world-space position - TODO confirm against the host struct
+	float4 m_quat;	// presumably the orientation quaternion - TODO confirm
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx;	// index into the collidables array
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  	// convex hull descriptor: counts plus offsets into the vertex / edge / face arrays
+{
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	
+	float			m_radius;
+	int	m_faceOffset;	// index of this hull's first face in the faces array
+	int m_numFaces;	// number of faces starting at m_faceOffset
+	int	m_numVertices;	// number of vertices starting at m_vertexOffset
+
+	int m_vertexOffset;	// index of this hull's first vertex in the vertices array
+	int	m_uniqueEdgesOffset;	// index of this hull's first entry in the unique-edges array
+	int	m_numUniqueEdges;	// number of unique edges starting at m_uniqueEdgesOffset
+	int m_unused;
+} ConvexPolyhedronCL;
+
+typedef struct 	// axis-aligned bounding box stored as a minimum and a maximum corner
+{
+	union
+	{	// minimum corner: one 16-byte slot viewed as a float4, as four floats or as four ints
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{	// maximum corner, with the same three views
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+
+
+
+typedef struct	// one polygonal face of a convex hull
+{
+	float4 m_plane;	// xyz is read as the face normal and w as the plane constant by the code below
+	int m_indexOffset;	// position of the face's first entry in the indices array
+	int m_numIndices;	// number of index entries (vertices) belonging to the face
+} btGpuFace;
+
+#define make_float4 (float4)
+
+
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	// 3D cross product of the xyz parts, delegated to the OpenCL built-in cross().
+	const float4 product = cross(a,b);
+	return product;
+
+	// Disabled alternatives kept for reference:
+	//   (a) clear w on both inputs first
+	//	float4 a1 = make_float4(a.xyz,0.f);
+	//	float4 b1 = make_float4(b.xyz,0.f);
+	//	return cross(a1,b1);
+	//   (b) expand the product by hand
+	//	float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);
+	//	return c;
+
+}
+
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	// Clear w on the by-value operands so that only x, y and z contribute to the built-in dot().
+	a.w = 0.f; b.w = 0.f;
+	return dot(a, b);
+}
+
+__inline
+float4 fastNormalize4(float4 v)
+{
+	const float4 xyzOnly = (float4)(v.x, v.y, v.z, 0.f);	// w is dropped before normalizing
+	return fast_normalize(xyzOnly);
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion;	// xyz = vector part, w = scalar part (see qtInvert / qtMul)
+// Forward declarations of the quaternion helpers defined further down. qtMul: product a*b.
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b);
+// qtNormalize: quaternion normalization helper.
+__inline
+Quaternion qtNormalize(Quaternion in);
+// qtRotate: rotates vec by q (computed as q * vec * qtInvert(q)).
+__inline
+float4 qtRotate(Quaternion q, float4 vec);
+// qtInvert: negates the vector part of q and keeps w.
+__inline
+Quaternion qtInvert(Quaternion q);
+
+
+
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// Quaternion product with xyz as the vector part and w as the scalar part:
+	//   xyz = a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz,   w = a.w*b.w - a.xyz . b.xyz
+	Quaternion product = cross3( a, b );
+	product += a.w*b+b.w*a;
+	product.w = a.w*b.w - dot3F4(a, b);	// replaces the w produced by the vector arithmetic above
+	return product;
+}
+
+__inline
+Quaternion qtNormalize(Quaternion in)
+{
+	// Returns `in` scaled to unit length over all four components. The earlier
+	// fastNormalize4() call cleared w first and so discarded the scalar part.
+	return fast_normalize(in);
+}
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	// Rotate by sandwiching the vector (taken as a quaternion with w = 0) between q and qtInvert(q).
+	vec.w = 0.f;
+	const Quaternion conjugate = qtInvert( q );
+	const Quaternion halfRotated = qtMul(q,vec);
+	return qtMul(halfRotated,conjugate);
+}
+
+__inline
+Quaternion qtInvert(Quaternion q)
+{
+	return (Quaternion)(-q.x, -q.y, -q.z, q.w);	// conjugate: vector part negated, w kept
+}
+
+__inline
+float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	return qtRotate( (Quaternion)(-q.xyz, q.w), vec );	// rotate by the conjugate of q (qtInvert written inline)
+}
+
+__inline
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	return qtRotate( orientation[0], p[0] ) + translation[0];	// rotate the point first, then translate it
+}
+
+
+
+__inline
+float4 normalize3(const float4 a)
+{
+	const float4 xyzOnly = (float4)(a.x, a.y, a.z, 0.f);
+	return fast_normalize( xyzOnly );	// what fastNormalize4() does: normalize with w forced to 0
+}
+
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max)
+{
+	// Projects the hull's vertices onto *dir and returns the covered interval in *min / *max.
+	// The vertices are used as stored: the axis is rotated by the inverse of orn instead, and
+	// the hull position's contribution along the axis is added to both bounds at the end.
+	const float4 axisInHullSpace = qtInvRotate(orn,*dir);
+	const float positionAlongAxis = dot(pos,*dir);
+	const int firstVertex = hull->m_vertexOffset;
+	const int vertexCount = hull->m_numVertices;
+	float lo = FLT_MAX;
+	float hi = -FLT_MAX;
+	for(int v=0;v<vertexCount;v++)
+	{
+		const float projection = dot(vertices[firstVertex+v],axisInHullSpace);
+		if(projection < lo)	lo = projection;
+		if(projection > hi)	hi = projection;
+	}
+	if(lo>hi)
+	{
+		// reached only when no projection updated the bounds (for example zero vertices)
+		const float swapped = lo; lo = hi; hi = swapped;
+	}
+	min[0] = lo + positionAlongAxis;
+	max[0] = hi + positionAlongAxis;
+}
+
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+{
+	// __global-pointer counterpart of projectLocal(): projects the hull's vertices onto *dir
+	// and returns the covered interval in *min / *max. The axis is rotated by the inverse of
+	// orn, and the hull position's contribution along the axis is added to both bounds.
+	const float4 axisInHullSpace = qtInvRotate(orn,*dir);
+	const float positionAlongAxis = dot(pos,*dir);
+	const int firstVertex = hull->m_vertexOffset;
+	const int vertexCount = hull->m_numVertices;
+	float lo = FLT_MAX;
+	float hi = -FLT_MAX;
+	for(int v=0;v<vertexCount;v++)
+	{
+		const float projection = dot(vertices[firstVertex+v],axisInHullSpace);
+		if(projection < lo)	lo = projection;
+		if(projection > hi)	hi = projection;
+	}
+	if(lo>hi)
+	{
+		// reached only when no projection updated the bounds (for example zero vertices)
+		const float swapped = lo; lo = hi; hi = swapped;
+	}
+	min[0] = lo + positionAlongAxis;
+	max[0] = hi + positionAlongAxis;
+}
+
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+{
+	// Projects both hulls onto *sep_axis; when the intervals overlap, stores the smaller overlap in *depth.
+	float loA,hiA,loB,hiB;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &loA, &hiA);
+	project(hullB,posB,ornB, sep_axis,verticesB, &loB, &hiB);
+	const bool intervalsOverlap = !(hiA<loB) && !(hiB<loA);
+	if(intervalsOverlap)
+	{
+		const float overlapAB = hiA - loB;
+		const float overlapBA = hiB - loA;
+		*depth = overlapAB<overlapBA ? overlapAB:overlapBA;
+	}
+	return intervalsOverlap;	// false: the intervals are disjoint and *depth was left untouched
+}
+
+
+
+
+inline bool IsAlmostZero(const float4 v)
+{
+	// True unless one of |x|, |y|, |z| exceeds the tolerance; w is not examined.
+	const float tolerance = 1e-6f;
+	return !(fabs(v.x)>tolerance || fabs(v.y)>tolerance || fabs(v.z)>tolerance);
+}
+
+
+
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	// Uses each face normal of hullA (rotated by ornA and oriented along DeltaC2) as a candidate axis.
+	// Returns false as soon as TestSepAxisLocalA() reports an axis on which the hulls do not overlap;
+	// otherwise returns true, with *sep / *dmin updated whenever an axis beats the current *dmin.
+	// uniqueEdgesA, uniqueEdgesB, facesB, indicesA and indicesB are accepted but never read here.
+	float4 originA = posA1;
+	originA.w = 0.f;	// the projection helpers take a 4-component dot() with the position
+	float4 originB = posB1;
+	originB.w = 0.f;
+	const int faceCount = hullA->m_numFaces;
+	for(int faceIdx=0;faceIdx<faceCount;faceIdx++)
+	{
+		const float4 localPlane = facesA[hullA->m_faceOffset+faceIdx].m_plane;
+		float4 axisWS = qtRotate(ornA,localPlane);
+		if (dot3F4(DeltaC2,axisWS)<0)
+		{
+			axisWS*=-1.f;
+		}
+
+		float overlap;
+		const bool overlapping = TestSepAxisLocalA( hullA, hullB, originA,ornA,originB,ornB,&axisWS, verticesA, verticesB,&overlap);
+		if(!overlapping)
+			return false;
+		if(overlap<*dmin)
+		{
+			*dmin = overlap;
+			*sep = axisWS;
+		}
+	}
+	// Flip the stored axis when it has a positive dot product with -DeltaC2.
+	const float4 reversedDelta = -DeltaC2;
+	if(dot3F4(reversedDelta,*sep)>0.0f) *sep = -(*sep);
+	return true;
+}
+
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, 
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA,
+	const float4* verticesB,
+	const float4* uniqueEdgesB, 
+	const btGpuFace* facesB,
+	const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	// Uses each face normal of the __global hullA (rotated by ornA and oriented along DeltaC2) as a
+	// candidate axis. hullB is the non-__global hull here, so the two hulls are handed to
+	// TestSepAxisLocalA() in swapped order to fit its (non-__global, __global) parameter list.
+	// Returns false on the first axis without overlap; otherwise true with *sep / *dmin updated.
+	float4 originA = posA1;
+	originA.w = 0.f;	// the projection helpers take a 4-component dot() with the position
+	float4 originB = posB1;
+	originB.w = 0.f;
+	const int faceCount = hullA->m_numFaces;
+	for(int faceIdx=0;faceIdx<faceCount;faceIdx++)
+	{
+		const float4 localPlane = facesA[hullA->m_faceOffset+faceIdx].m_plane;
+		float4 axisWS = qtRotate(ornA,localPlane);
+		if (dot3F4(DeltaC2,axisWS)<0)
+		{
+			axisWS *= -1.f;
+		}
+
+		float overlap;
+		const bool overlapping = TestSepAxisLocalA( hullB, hullA, originB,ornB,originA,ornA, &axisWS, verticesB,verticesA, &overlap);
+		if(!overlapping)
+			return false;
+		if(overlap<*dmin)
+		{
+			*dmin = overlap;
+			*sep = axisWS;
+		}
+	}
+	// Flip the stored axis when it has a positive dot product with -DeltaC2.
+	const float4 reversedDelta = -DeltaC2;
+	if(dot3F4(reversedDelta,*sep)>0.0f) *sep = -(*sep);
+	return true;
+}
+
+
+
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+{
+	// Tests the normalized cross product of every (unique edge of A, unique edge of B) pair,
+	// each edge rotated by its hull's orientation, as a candidate separating axis.
+	//
+	// hullA/verticesA/uniqueEdgesA : non-__global hull A and its vertex / edge arrays
+	// hullB/verticesB/uniqueEdgesB : __global hull B and its vertex / edge arrays
+	// posA1,ornA / posB1,ornB      : position and orientation quaternion of each hull
+	// DeltaC2                      : reference direction; candidate axes are flipped so that
+	//                                dot3F4(DeltaC2,axis) is not negative
+	// sep, dmin                    : in/out; updated whenever an axis has a smaller overlap
+	//                                than the current *dmin
+	// facesA/B and indicesA/B are accepted but never read here.
+	//
+	// Returns false as soon as one axis separates the projected hulls, true otherwise.
+	//
+	// Fix: the interval test used to be inlined here; on separation it set a local flag to
+	// false, then unconditionally reset it to true and never read it, so a separating
+	// edge-edge axis was never reported and its negative "overlap" ended up in *dmin.
+	// The test now goes through TestSepAxisLocalA(), like the face-normal variants.
+
+	float4 posA = posA1;
+	posA.w = 0.f;	// the projection helpers take a 4-component dot() with the position
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+
+			float4 crossje = cross3(edge0World,edge1World);
+			if(IsAlmostZero(crossje))
+			{
+				// the cross product is (almost) zero, e.g. parallel edges: no usable axis
+				continue;
+			}
+
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje *= -1.f;
+
+			float dist;
+			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&crossje, verticesA, verticesB,&dist))
+			{
+				// the projected intervals are disjoint: crossje separates the hulls
+				return false;
+			}
+
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	// Flip the stored axis when it has a positive dot product with -DeltaC2.
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+
+inline int	findClippingFaces(const float4 separatingNormal,
+                      const ConvexPolyhedronCL* hullA, 
+					  __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1,
+                      __global float4* worldNormalsA1,
+                      __global float4* worldVertsB1,
+                      int capacityWorldVerts,
+                      const float minDist, float maxDist,
+					  const float4* verticesA,
+                      const btGpuFace* facesA,
+                      const int* indicesA,
+					  __global const float4* verticesB,
+                      __global const btGpuFace* facesB,
+                      __global const int* indicesB,
+                      __global int4* clippingFaces, int pairIndex)
+{	// Selects one face on each hull relative to separatingNormal and stores both polygons, transformed, for pairIndex.
+	int numContactsOut = 0;	// never modified below, so the function always returns 0
+	int numWorldVertsB1= 0;	// number of hull-B vertices written to worldVertsB1
+    // Outputs: worldVertsB1 / worldVertsA1 (capacityWorldVerts slots per pair), worldNormalsA1[pairIndex]
+    // and clippingFaces[pairIndex]. minDist and maxDist are accepted but not read in this function.
+	int closestFaceB=0;
+	float dmax = -FLT_MAX;
+    // Step 1: pick the face of hull B whose rotated normal has the largest dot product with separatingNormal.
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,
+                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+    // Step 2: write that face's vertices, transformed by (posB, ornB), into this pair's slice of worldVertsB1.
+	{
+		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		int numVertices = polyB.m_numIndices;
+        if (numVertices>capacityWorldVerts)
+            numVertices = capacityWorldVerts;
+        if (numVertices<0)
+            numVertices = 0;
+        // numVertices is now within [0, capacityWorldVerts], so the per-vertex check below always passes
+		for(int e0=0;e0<numVertices;e0++)
+		{
+            if (e0<capacityWorldVerts)
+            {
+                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+            }
+		}
+	}
+    // Step 3: pick the face of hull A whose rotated normal has the smallest dot product with separatingNormal.
+    int closestFaceA=0;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+                                              facesA[hullA->m_faceOffset+face].m_plane.x,
+                                              facesA[hullA->m_faceOffset+face].m_plane.y,
+                                              facesA[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+            // worldNormalsA1[pairIndex] is rewritten on every improvement, so it ends up holding the chosen normal
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS;
+			}
+		}
+	}
+    // Step 4: write the chosen A face's vertices, transformed by (posA, ornA), into worldVertsA1.
+    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;
+    if (numVerticesA>capacityWorldVerts)
+       numVerticesA = capacityWorldVerts;
+    if (numVerticesA<0)
+        numVerticesA=0;
+    // numVerticesA is clamped to [0, capacityWorldVerts] as well
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        if (e0<capacityWorldVerts)
+        {
+            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);
+        }
+    }
+    // Step 5: record the result for this pair: x = face of A, y = face of B, z = A vertex count, w = B vertex count.
+    clippingFaces[pairIndex].x = closestFaceA;
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA;
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+    
+    // No contacts are generated here; the return value is the unchanged zero from the top of the function.
+	return numContactsOut;
+}
+
+
+
+
+// work-in-progress
+__kernel void   findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,
+                                                __global const BodyData* rigidBodies,
+                                                __global const btCollidableGpu* collidables,
+                                                __global const ConvexPolyhedronCL* convexShapes,
+                                                __global const float4* vertices,
+                                                __global const float4* uniqueEdges,
+                                                __global const btGpuFace* faces,
+                                                __global const int* indices,
+                                                __global const btGpuChildShape* gpuChildShapes,
+                                                __global btAabbCL* aabbs,
+                                                __global float4* concaveSeparatingNormalsOut,
+                                                __global int* concaveHasSeparatingNormals,
+                                                __global int4* clippingFacesOut,
+                                                __global float4* worldVertsA1GPU,
+                                                __global float4*  worldNormalsAGPU,
+                                                __global float4* worldVertsB1GPU,
+                                                __global float* dmins,
+                                                int vertexFaceCapacity,
+                                                int numConcavePairs
+                                                )
+{
+    
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+    
+	concaveHasSeparatingNormals[i] = 0;
+    
+	int pairIdx = i;
+    
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+    
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+    
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+    
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1;
+		return;
+	}
+    
+    
+    
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	int numActualConcaveConvexTests = 0;
+	
+	int f = concavePairs[i].z;
+	
+	bool overlap = false;
+	
+	ConvexPolyhedronCL convexPolyhedronA;
+    
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+    
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb;
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	
+	float4 verticesA[3];
+	for (int i=0;i<3;i++)
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+        
+		triAabb.m_min = min(triAabb.m_min,vert);
+		triAabb.m_max = max(triAabb.m_max,vert);
+        
+	}
+    
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+    
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5;
+		float4 sepAxis=make_float4(1,2,3,4);
+        
+		int localCC=0;
+		numActualConcaveConvexTests++;
+        
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+        
+        
+		convexPolyhedronA.m_faceOffset = 0;
+        
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+        
+		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+        
+		//front size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+        
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+                
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+                
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+                
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+        
+        
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+        
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+        
+		
+        
+        
+		///////////////////
+		///compound shape support
+        
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+        
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+        
+        
+		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+                                             posA,ornA,
+                                             posB,ornB,
+                                             DeltaC2,
+                                             verticesA,uniqueEdgesA,facesA,indicesA,
+                                             vertices,uniqueEdges,faces,indices,
+                                             &sepAxis,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,
+                                                 posB,ornB,
+                                                 posA,ornA,
+                                                 DeltaC2,
+                                                 vertices,uniqueEdges,faces,indices,
+                                                 verticesA,uniqueEdgesA,facesA,indicesA,
+                                                 &sepAxis,&dmin);
+            
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				hasSeparatingAxis = 1;
+			}
+		}	
+		
+		if (hasSeparatingAxis)
+		{
+            dmins[i] = dmin;
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+            
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+}
+
+
+
+
+// work-in-progress: edge-edge separating-axis pass for concave (one triangle face of shape A) vs. convex (shape B) pairs; one work-item per pair.
+__kernel void   findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs, // x,y: body indices A,B; z: face index in shape A; w: child shape index for a compound B (-1 marks the pair inactive)
+                                                          __global const BodyData* rigidBodies,
+                                                          __global const btCollidableGpu* collidables,
+                                                          __global const ConvexPolyhedronCL* convexShapes,
+                                                          __global const float4* vertices,
+                                                          __global const float4* uniqueEdges,
+                                                          __global const btGpuFace* faces,
+                                                          __global const int* indices,
+                                                          __global const btGpuChildShape* gpuChildShapes,
+                                                          __global btAabbCL* aabbs, // indexed by body; only aabbs[bodyIndexB] is read here
+                                                          __global float4* concaveSeparatingNormalsOut, // per pair: read as the starting axis, rewritten (with w = dmin) when an axis is kept
+                                                          __global int* concaveHasSeparatingNormals, // per pair: pairs with a zero flag are skipped
+                                                          __global int4* clippingFacesOut, // forwarded to findClippingFaces
+                                                          __global float4* worldVertsA1GPU, // forwarded to findClippingFaces
+                                                          __global float4*  worldNormalsAGPU, // forwarded to findClippingFaces
+                                                          __global float4* worldVertsB1GPU, // forwarded to findClippingFaces
+                                                          __global float* dmins, // per pair: dmin value read before the test and written back after it
+                                                          int vertexFaceCapacity, // forwarded to findClippingFaces
+                                                          int numConcavePairs // work-items with id >= this value return immediately
+                                                          )
+{
+	// Work-items past the end of the pair list have nothing to do.
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	// A zero flag means no axis is recorded for this pair (presumably by the preceding separating-axis pass - confirm), so skip it.
+	if (!concaveHasSeparatingNormals[i])
+        return;
+    
+	int pairIdx = i;
+	// concavePairs[i]: x = body A (owner of the triangle face), y = body B.
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+    
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+    
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+    
+    
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces; // NOTE(review): never read in this kernel
+	int numActualConcaveConvexTests = 0; // NOTE(review): only incremented below, never read
+	// z selects one face of shape A; it is treated as a triangle below.
+	int f = concavePairs[i].z;
+	
+	bool overlap = false;
+	
+	ConvexPolyhedronCL convexPolyhedronA;
+	// Temporary polyhedron describing that triangle; its offsets are 0, matching the local verticesA/uniqueEdgesA/facesA/indicesA arrays passed alongside it.
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+    
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb; // NOTE(review): declared but never used
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	// Gather the 3 vertices, accumulating their sum and the triangle's AABB. (The loop's i shadows the work-item index inside the loop.)
+	float4 verticesA[3];
+	for (int i=0;i<3;i++)
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+        
+		triAabb.m_min = min(triAabb.m_min,vert);
+		triAabb.m_max = max(triAabb.m_max,vert);
+        
+	}
+	// NOTE(review): triAabb is built from the vertices exactly as stored (no body-A transform applied) - confirm aabbs[bodyIndexB] is in the same frame.
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+    
+	if (overlap)
+	{
+		float dmin = dmins[i];
+		int hasSeparatingAxis=5; // placeholder; set to 0 or 1 after the edge-edge test
+		float4 sepAxis=make_float4(1,2,3,4); // placeholder, overwritten on the next line
+        sepAxis = concaveSeparatingNormalsOut[pairIdx];
+		// dmin and sepAxis start from the values already stored for this pair and are passed by pointer to the edge-edge test below.
+		int localCC=0; // NOTE(review): never used
+		numActualConcaveConvexTests++;
+        
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+        
+        
+		convexPolyhedronA.m_faceOffset = 0;
+        
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+		// Triangle as a polyhedron: front face, back face, then one plane per edge. indicesA holds 3+3 face indices plus 2 per edge plane.
+		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+        
+		//front side of triangle: reuses the plane stored with the face
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back side of triangle: negated normal, reversed vertex order
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w; // NOTE(review): unused; the back plane's w stored below is c, not c1
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		// Edge planes: each normal is normalize(cross(normal, v1-v0)) with w = -dot(edgeNormal, v0); each references its 2 edge vertices.
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+                
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+                
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+                
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; // 2 faces + 3 edge planes were written above
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); // average of the 3 vertices
+        
+        
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+        
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+        
+		
+        
+        
+		///////////////////
+		///compound shape support
+		// For a compound B, the pair's w indexes gpuChildShapes directly; the child's transform is composed with B's and the child's shape replaces shapeIndexB.
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+		// DeltaC2: difference of the two shape centers after applying each body's position/orientation.
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+        
+        
+		{
+			bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+                                                              posA,ornA,
+                                                              posB,ornB,
+                                                              DeltaC2,
+                                                              verticesA,uniqueEdgesA,facesA,indicesA,
+                                                              vertices,uniqueEdges,faces,indices,
+                                                              &sepAxis,&dmin);
+                
+			if (!sepEE)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				hasSeparatingAxis = 1;
+			}
+		}
+		
+		
+		if (hasSeparatingAxis)
+		{
+			sepAxis.w = dmin;
+            dmins[i] = dmin;
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+			// Constants forwarded to findClippingFaces as minDist/maxDist.
+ 	float minDist = -1e30f;
+			float maxDist = 0.02f;
+
+            
+            findClippingFaces(sepAxis,
+                              &convexPolyhedronA,
+                              &convexShapes[shapeIndexB],
+                              posA,ornA,
+                              posB,ornB,
+                              worldVertsA1GPU,
+                              worldNormalsAGPU,
+                              worldVertsB1GPU,
+                              vertexFaceCapacity,
+                              minDist, maxDist,
+                              verticesA,
+                              facesA,
+                              indicesA,
+                              vertices,
+                              faces,
+                              indices,
+                              clippingFacesOut, pairIdx);
+	           
+            
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+	// Reached by every pair that was not skipped by the early returns above.
+	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points
+}
+
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
new file mode 100644
index 0000000000..611569cacf
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
@@ -0,0 +1,1457 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satConcaveKernelsCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"#define B3_MAX_STACK_DEPTH 256\n"
+"typedef unsigned int u32;\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} b3QuantizedBvhNode;\n"
+"typedef struct\n"
+"{\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"}\n"
+"int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	unsigned int x=0;\n"
+"	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	// Get only the lower bits where the triangle index is stored\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
+"}\n"
+"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"}\n"
+"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
+"}\n"
+"	\n"
+"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"}\n"
+"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
+"}\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	int			m_padding[3];\n"
+"} b3BvhSubtreeInfo;\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"typedef struct  \n"
+"{\n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"#ifndef B3_AABB_H\n"
+"#define B3_AABB_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"{\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
+"}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"{\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"{\n"
+"	b3Quat q;\n"
+"	q=in;\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"}\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"}\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"typedef struct\n"
+"{\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"}\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3Aabb b3Aabb_t;\n"
+"struct b3Aabb\n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float m_min[4];\n"
+"		b3Float4 m_minVec;\n"
+"		int m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float	m_max[4];\n"
+"		b3Float4 m_maxVec;\n"
+"		int m_signedMaxIndices[4];\n"
+"	};\n"
+"};\n"
+"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
+"						b3Float4ConstArg pos,\n"
+"						b3QuatConstArg orn,\n"
+"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
+"{\n"
+"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
+"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
+"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
+"		b3Mat3x3 m;\n"
+"		m = b3QuatGetRotationMatrix(orn);\n"
+"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
+"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
+"		\n"
+"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
+"										 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
+"										 0.f);\n"
+"		*aabbMinOut = center-extent;\n"
+"		*aabbMaxOut = center+extent;\n"
+"}\n"
+"/// conservative test for overlap between two aabbs\n"
+"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
+"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
+"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
+"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"#endif //B3_AABB_H\n"
+"/*\n"
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"#ifndef B3_INT2_H\n"
+"#define B3_INT2_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#define b3UnsignedInt2 uint2\n"
+"#define b3Int2 int2\n"
+"#define b3MakeInt2 (int2)\n"
+"#endif //__cplusplus\n"
+"#endif\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define make_float4 (float4)\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	return cross(a,b);\n"
+"	\n"
+"//	float4 a1 = make_float4(a.xyz,0.f);\n"
+"//	float4 b1 = make_float4(b.xyz,0.f);\n"
+"//	return cross(a1,b1);\n"
+"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//return c;\n"
+"}\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"}\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	v = make_float4(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"typedef float4 Quaternion;\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"}\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.xyz, q.w); // conjugate; this equals the inverse only for unit-length q\n"
+"}\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	return qtRotate( qtInvert( q ), vec ); // rotate by the conjugate of q, i.e. undo q's rotation\n"
+"}\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	return qtRotate( *orientation, *p ) + (*translation); // rotate first, then translate (4-wide add, so translation->w is added too)\n"
+"}\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	// Normalizes the xyz part of a; the w lane is zeroed before normalizing.\n"
+"	return fastNormalize4( make_float4(a.x, a.y, a.z, 0.f) );\n"
+"}\n"
+"inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, const float4* vertices, float* min, float* max)\n"
+"{ // Projects the hull's vertices onto *dir and writes the resulting interval to min[0]/max[0]; same body as project() but without __global pointers.\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir); // bring dir into the hull's local frame so the vertices need no rotation\n"
+"	float offset = dot(pos,*dir); // 4-component dot; the callers visible in this file zero pos.w beforehand\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0]) // only true when no vertex updated the interval (e.g. numVerts <= 0)\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset; // add the hull position's projection so the interval includes the translation\n"
+"	max[0] += offset;\n"
+"}\n"
+"inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"{ // Projects the hull's vertices onto *dir and writes the resulting interval to min[0]/max[0]; hull and vertices are __global here.\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"	const float4 localDir = qtInvRotate(orn,*dir); // bring dir into the hull's local frame so the vertices need no rotation\n"
+"	float offset = dot(pos,*dir); // 4-component dot; the callers visible in this file zero pos.w beforehand\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0]) // only true when no vertex updated the interval (e.g. numVerts <= 0)\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset; // add the hull position's projection so the interval includes the translation\n"
+"	max[0] += offset;\n"
+"}\n"
+"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n"
+"{\n"
+"	// Projects both hulls onto sep_axis; false (with *depth untouched) means the intervals are disjoint.\n"
+"	float loA,hiA,loB,hiB;\n"
+"	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &loA, &hiA);\n"
+"	project(hullB,posB,ornB, sep_axis,verticesB, &loB, &hiB);\n"
+"	if(hiA<loB || hiB<loA)\n"
+"		return false;\n"
+"	const float depthAB = hiA - loB;\n"
+"	const float depthBA = hiB - loA;\n"
+"	*depth = depthAB<depthBA ? depthAB:depthBA; // smaller of the two overlap extents\n"
+"	return true;\n"
+"}\n"
+"inline bool IsAlmostZero(const float4 v)\n"
+"{\n"
+"	// True unless some xyz component has magnitude above 1e-6; w is ignored.\n"
+"	const float eps = 1e-6f;\n"
+"	return !(fabs(v.x)>eps || fabs(v.y)>eps || fabs(v.z)>eps);\n"
+"}\n"
+"bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	// uniqueEdgesA/B, indicesA/B and facesB are accepted but not read in this function\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// Tests every face normal of hullA as a candidate axis; returns false at the first axis whose projections are disjoint.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0; // incremented below but never read\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0) // flip so the axis does not point against DeltaC2\n"
+"				faceANormalWS*=-1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n"
+"				return false;\n"
+"			if(d<*dmin) // keep the axis with the smallest overlap seen so far (*dmin is in/out)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f) // final orientation: negate *sep when it points along -DeltaC2\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* verticesA, \n"
+"	__global const float4* uniqueEdgesA, \n"
+"	__global const btGpuFace* facesA,\n"
+"	__global const int*  indicesA,\n"
+"	const float4* verticesB,\n"
+"	const float4* uniqueEdgesB, \n"
+"	const btGpuFace* facesB,\n"
+"	const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{ // Mirror of findSeparatingAxisLocalA: here hullA is the __global hull whose face normals are tested and hullB is the non-__global one.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0; // incremented below but never read\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0) // flip so the axis does not point against DeltaC2\n"
+"				faceANormalWS *= -1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d)) // arguments swapped: TestSepAxisLocalA takes the non-__global hull first\n"
+"				return false;\n"
+"			if(d<*dmin) // keep the axis with the smallest overlap seen so far (*dmin is in/out)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f) // final orientation: negate *sep when it points along -DeltaC2\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"		float4* sep,\n"
+"	float* dmin)\n"
+"{ // Tests the cross product of every (edge of A, edge of B) pair as a candidate axis and updates *sep / *dmin with the smallest value found.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0; // never used\n"
+"	int curEdgeEdge = 0; // incremented below but never read\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			curEdgeEdge++;\n"
+"			if(!IsAlmostZero(crossje)) // skip (near-)parallel edge pairs, whose cross product gives no usable axis\n"
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n"
+"					crossje *= -1.f;\n"
+"				float dist;\n"
+"				bool result = true; // NOTE(review): result is assigned below but never read\n"
+"				{\n"
+"					float Min0,Max0;\n"
+"					float Min1,Max1;\n"
+"					projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n"
+"					project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n"
+"				\n"
+"					if(Max0<Min1 || Max1<Min0)\n"
+"						result = false; // no effect: overwritten by 'result = true' a few lines down\n"
+"					// NOTE(review): unlike TestSepAxisLocalA, a disjoint projection does not make this function return false; dist can then be negative - confirm this is intended\n"
+"					float d0 = Max0 - Min1;\n"
+"					float d1 = Max1 - Min0;\n"
+"					dist = d0<d1 ? d0:d1;\n"
+"					result = true;\n"
+"				}\n"
+"				\n"
+"				if(dist<*dmin)\n"
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f) // final orientation: negate *sep when it points along -DeltaC2\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true; // this is the only return statement, so callers never see false\n"
+"}\n"
+"inline int	findClippingFaces(const float4 separatingNormal,\n"
+"                      const ConvexPolyhedronCL* hullA, \n"
+"					  __global const ConvexPolyhedronCL* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n"
+"                      __global float4* worldNormalsA1,\n"
+"                      __global float4* worldVertsB1,\n"
+"                      int capacityWorldVerts,\n"
+"                      const float minDist, float maxDist,\n"
+"					  const float4* verticesA,\n"
+"                      const btGpuFace* facesA,\n"
+"                      const int* indicesA,\n"
+"					  __global const float4* verticesB,\n"
+"                      __global const btGpuFace* facesB,\n"
+"                      __global const int* indicesB,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"
+"{\n"
+"	int numContactsOut = 0; // never modified: the function always returns 0\n"
+"	int numWorldVertsB1= 0;\n"
+"    // Picks one face on each hull relative to separatingNormal and writes their transformed vertices to the output arrays.\n"
+"    // minDist and maxDist are accepted but not read here.\n"
+"	int closestFaceB=0;\n"
+"	float dmax = -FLT_MAX;\n"
+"    // Face of B whose rotated normal has the largest dot product with separatingNormal.\n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    // Transform that face's vertices and store them in this pair's slice of worldVertsB1 (capacityWorldVerts entries per pair).\n"
+"	{\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"        if (numVertices>capacityWorldVerts)\n"
+"            numVertices = capacityWorldVerts;\n"
+"        if (numVertices<0)\n"
+"            numVertices = 0;\n"
+"        \n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"            if (e0<capacityWorldVerts) // always true here: numVertices was clamped to capacityWorldVerts above\n"
+"            {\n"
+"                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"            }\n"
+"		}\n"
+"	}\n"
+"    // Face of A whose rotated normal has the smallest dot product with separatingNormal.\n"
+"    int closestFaceA=0;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS; // rewritten on every improvement, so the last write is the chosen face's normal\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"    if (numVerticesA>capacityWorldVerts)\n"
+"       numVerticesA = capacityWorldVerts;\n"
+"    if (numVerticesA<0)\n"
+"        numVerticesA=0;\n"
+"    \n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        if (e0<capacityWorldVerts) // always true here: numVerticesA was clamped to capacityWorldVerts above\n"
+"        {\n"
+"            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"        }\n"
+"    }\n"
+"    // Record the chosen faces and how many vertices were written for each.\n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n"
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"}\n"
+"// work-in-progress: face-normal separating-axis stage for (trimesh triangle, convex hull) pairs; each work-item handles one entry of concavePairs\n"
+"__kernel void   findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs, // x,y: body indices A,B; z: triangle face index in A; w: compound child index, written as -1 when the pair is rejected\n"
+"                                                __global const BodyData* rigidBodies,\n"
+"                                                __global const btCollidableGpu* collidables,\n"
+"                                                __global const ConvexPolyhedronCL* convexShapes,\n"
+"                                                __global const float4* vertices,\n"
+"                                                __global const float4* uniqueEdges,\n"
+"                                                __global const btGpuFace* faces,\n"
+"                                                __global const int* indices,\n"
+"                                                __global const btGpuChildShape* gpuChildShapes,\n"
+"                                                __global btAabbCL* aabbs,\n"
+"                                                __global float4* concaveSeparatingNormalsOut, // out: best axis per pair\n"
+"                                                __global int* concaveHasSeparatingNormals, // out: 1 when both face-normal tests passed, else 0\n"
+"                                                __global int4* clippingFacesOut,\n"
+"                                                __global float4* worldVertsA1GPU,\n"
+"                                                __global float4*  worldNormalsAGPU,\n"
+"                                                __global float4* worldVertsB1GPU,\n"
+"                                                __global float* dmins, // out: smallest overlap value per pair\n"
+"                                                int vertexFaceCapacity,\n"
+"                                                int numConcavePairs\n"
+"                                                )\n"
+"{\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"    // Cleared first; set to 1 further down only if every tested face-normal axis overlaps.\n"
+"	concaveHasSeparatingNormals[i] = 0;\n"
+"    \n"
+"	int pairIdx = i;\n"
+"    \n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"    \n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"    \n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"    // Only convex hulls and compounds of convex hulls are handled for body B; anything else is rejected.\n"
+"	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n"
+"		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"	{\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"		return;\n"
+"	}\n"
+"    \n"
+"    \n"
+"    \n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used below\n"
+"	int numActualConcaveConvexTests = 0; // incremented below but never read\n"
+"	\n"
+"	int f = concavePairs[i].z;\n"
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA; // temporary hull describing just this one triangle\n"
+"    \n"
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"    \n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb; // declared but not used\n"
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++) // this i shadows the work-item index declared above\n"
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"        \n"
+"		triAabb.m_min = min(triAabb.m_min,vert);\n"
+"		triAabb.m_max = max(triAabb.m_max,vert);\n"
+"        \n"
+"	}\n"
+"    // NOTE(review): triAabb is built from the vertices as stored (posA/ornA are not applied); this assumes aabbs[bodyIndexB] is in the same frame - confirm.\n"
+"	overlap = true;\n"
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"    \n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		int hasSeparatingAxis=5; // placeholder; reassigned to 4, then 0 or 1 below\n"
+"		float4 sepAxis=make_float4(1,2,3,4); // placeholder value\n"
+"        \n"
+"		int localCC=0; // not used\n"
+"		numActualConcaveConvexTests++;\n"
+"        \n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"        \n"
+"        \n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"        \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"        \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2]; // front (3) + back (3) + three edge planes (2 each)\n"
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"        \n"
+"		//front side of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back side of triangle (negated normal, reversed winding)\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w; // computed but not used\n"
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"        \n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++) // one two-vertex side face per triangle edge; i again shadows the work-item index\n"
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot(edgeNormal,v0);\n"
+"                \n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"        \n"
+"        \n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"        \n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"        \n"
+"		\n"
+"        \n"
+"        \n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"        // For a compound B, switch posB/ornB/shapeIndexB to the child selected by concavePairs[pairIdx].w.\n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n"
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"        \n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1; // from B's centre to the triangle's centre\n"
+"        \n"
+"        // First the triangle's face normals, then (only if those all overlap) B's face normals.\n"
+"		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"                                             posA,ornA,\n"
+"                                             posB,ornB,\n"
+"                                             DeltaC2,\n"
+"                                             verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                             vertices,uniqueEdges,faces,indices,\n"
+"                                             &sepAxis,&dmin);\n"
+"		hasSeparatingAxis = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,\n"
+"                                                 posB,ornB,\n"
+"                                                 posA,ornA,\n"
+"                                                 DeltaC2,\n"
+"                                                 vertices,uniqueEdges,faces,indices,\n"
+"                                                 verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                                 &sepAxis,&dmin);\n"
+"            \n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				hasSeparatingAxis = 1;\n"
+"			}\n"
+"		}	\n"
+"		\n"
+"		if (hasSeparatingAxis) // nonzero here means both helpers returned true, i.e. no tested axis had disjoint projections\n"
+"		{\n"
+"            dmins[i] = dmin;\n"
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"            \n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n"
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"	}\n"
+"}\n"
+"// work-in-progress: edge-edge stage; continues from the dmins / concaveSeparatingNormalsOut values of pairs flagged in concaveHasSeparatingNormals, then records clipping faces\n"
+"__kernel void   findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs, // x,y: body indices A,B; z: triangle face index in A (overwritten with -1 at the end); w: compound child index, -1 when rejected\n"
+"                                                          __global const BodyData* rigidBodies,\n"
+"                                                          __global const btCollidableGpu* collidables,\n"
+"                                                          __global const ConvexPolyhedronCL* convexShapes,\n"
+"                                                          __global const float4* vertices,\n"
+"                                                          __global const float4* uniqueEdges,\n"
+"                                                          __global const btGpuFace* faces,\n"
+"                                                          __global const int* indices,\n"
+"                                                          __global const btGpuChildShape* gpuChildShapes,\n"
+"                                                          __global btAabbCL* aabbs,\n"
+"                                                          __global float4* concaveSeparatingNormalsOut, // in/out: axis per pair; w receives dmin\n"
+"                                                          __global int* concaveHasSeparatingNormals, // in: pairs with 0 are skipped\n"
+"                                                          __global int4* clippingFacesOut,\n"
+"                                                          __global float4* worldVertsA1GPU,\n"
+"                                                          __global float4*  worldNormalsAGPU,\n"
+"                                                          __global float4* worldVertsB1GPU,\n"
+"                                                          __global float* dmins, // in/out: smallest overlap value per pair\n"
+"                                                          int vertexFaceCapacity,\n"
+"                                                          int numConcavePairs\n"
+"                                                          )\n"
+"{\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"    // Skipped pairs return here, before the concavePairs[i].z reset at the end of the kernel.\n"
+"	if (!concaveHasSeparatingNormals[i])\n"
+"        return;\n"
+"    \n"
+"	int pairIdx = i;\n"
+"    \n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"    \n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"    \n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"    \n"
+"    \n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces; // not used below\n"
+"	int numActualConcaveConvexTests = 0; // incremented below but never read\n"
+"	\n"
+"	int f = concavePairs[i].z;\n"
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA; // temporary hull describing just this one triangle\n"
+"    \n"
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"    \n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb; // declared but not used\n"
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++) // this i shadows the work-item index declared above\n"
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"        \n"
+"		triAabb.m_min = min(triAabb.m_min,vert);\n"
+"		triAabb.m_max = max(triAabb.m_max,vert);\n"
+"        \n"
+"	}\n"
+"    // NOTE(review): triAabb is built from the vertices as stored (posA/ornA are not applied); this assumes aabbs[bodyIndexB] is in the same frame - confirm.\n"
+"	overlap = true;\n"
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"    \n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = dmins[i]; // start from the value already stored for this pair\n"
+"		int hasSeparatingAxis=5; // placeholder; reassigned to 0 or 1 below\n"
+"		float4 sepAxis=make_float4(1,2,3,4); // placeholder, overwritten on the next line\n"
+"        sepAxis = concaveSeparatingNormalsOut[pairIdx];\n"
+"        \n"
+"		int localCC=0; // not used\n"
+"		numActualConcaveConvexTests++;\n"
+"        \n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"        \n"
+"        \n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"        \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"        \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2]; // front (3) + back (3) + three edge planes (2 each)\n"
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"        \n"
+"		//front side of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back side of triangle (negated normal, reversed winding)\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w; // computed but not used\n"
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"        \n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++) // one two-vertex side face per triangle edge; i again shadows the work-item index\n"
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot(edgeNormal,v0);\n"
+"                \n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"        \n"
+"        \n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"        \n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"        \n"
+"		\n"
+"        \n"
+"        \n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"        // For a compound B, switch posB/ornB/shapeIndexB to the child selected by concavePairs[pairIdx].w.\n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n"
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"        \n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1; // from B's centre to the triangle's centre\n"
+"        \n"
+"        \n"
+"		{\n"
+"			bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"                                                              posA,ornA,\n"
+"                                                              posB,ornB,\n"
+"                                                              DeltaC2,\n"
+"                                                              verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"                                                              vertices,uniqueEdges,faces,indices,\n"
+"                                                              &sepAxis,&dmin);\n"
+"                \n"
+"			if (!sepEE) // findSeparatingAxisEdgeEdgeLocalA returns true on every path as written, so this branch is not taken\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				hasSeparatingAxis = 1;\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		\n"
+"		if (hasSeparatingAxis)\n"
+"		{\n"
+"			sepAxis.w = dmin; // the overlap value travels in the w lane of the stored axis\n"
+"            dmins[i] = dmin;\n"
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"           \n"
+" 	float minDist = -1e30f;\n"
+"			float maxDist = 0.02f;\n"
+"            // Return value is ignored; results are written through the output pointers.\n"
+"            findClippingFaces(sepAxis,\n"
+"                              &convexPolyhedronA,\n"
+"                              &convexShapes[shapeIndexB],\n"
+"                              posA,ornA,\n"
+"                              posB,ornB,\n"
+"                              worldVertsA1GPU,\n"
+"                              worldNormalsAGPU,\n"
+"                              worldVertsB1GPU,\n"
+"                              vertexFaceCapacity,\n"
+"                              minDist, maxDist,\n"
+"                              verticesA,\n"
+"                              facesA,\n"
+"                              indicesA,\n"
+"                              vertices,\n"
+"                              faces,\n"
+"                              indices,\n"
+"                              clippingFacesOut, pairIdx);\n"
+"	           \n"
+"            \n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n"
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n"
+"	}\n"
+"	\n"
+"	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n"
+"}\n"
+;
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
new file mode 100644
index 0000000000..6f8b0a90db
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@@ -0,0 +1,2104 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satKernelsCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"// SHAPE_* values are the shape type tags that kernels compare against btCollidableGpu.m_shapeType.\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"#define B3_MAX_STACK_DEPTH 256\n"
+"typedef unsigned int u32;\n"
+"///keep this in sync with btCollidable.h\n"
+"// Per-collidable record: a shape type tag (m_shapeType) and an index (m_shapeIndex).\n"
+"// Each anonymous union overlays two fields on the same storage; which member is\n"
+"// meaningful presumably depends on m_shapeType (not shown here - confirm against btCollidable.h).\n"
+"typedef struct\n"
+"{\n"
+"	union {\n"
+"		int m_numChildShapes;\n"
+"		int m_bvhIndex;\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float m_radius;\n"
+"		int	m_compoundBvhIndex;\n"
+"	};\n"
+"	\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"// Number of bits subtracted from 31 when building the triangle-index mask in getTriangleIndex().\n"
+"#define MAX_NUM_PARTS_IN_BITS 10\n"
+"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes\n"
+"	// >= 0: leaf node (see isLeafNode); negative: internal node whose negated value is the escape index (see getEscapeIndex).\n"
+"	int	m_escapeIndexOrTriangleIndex;\n"
+"} b3QuantizedBvhNode;\n"
+"// Device-side counterpart of the host b3BvhInfo struct (same field names and order, float4 instead of b3Vector3).\n"
+"typedef struct\n"
+"{\n"
+"	float4		m_aabbMin;\n"
+"	float4		m_aabbMax;\n"
+"	float4		m_quantization;\n"
+"	int			m_numNodes;\n"
+"	int			m_numSubTrees;\n"
+"	int			m_nodeOffset;\n"
+"	int			m_subTreeOffset;\n"
+"} b3BvhInfo;\n"
+"// Extracts the triangle index from a node by keeping only the bits below position (31-MAX_NUM_PARTS_IN_BITS).\n"
+"int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	// Every bit from position (31-MAX_NUM_PARTS_IN_BITS) upwards is cleared from the stored value.\n"
+"	const unsigned int upperBits = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	const unsigned int triangleMask = ~upperBits;\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&triangleMask);\n"
+"}\n"
+"// Same as getTriangleIndex(), for a node that lives in __global memory.\n"
+"int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	// Every bit from position (31-MAX_NUM_PARTS_IN_BITS) upwards is cleared from the stored value.\n"
+"	const unsigned int upperBits = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);\n"
+"	const unsigned int triangleMask = ~upperBits;\n"
+"	return (rootNode->m_escapeIndexOrTriangleIndex&triangleMask);\n"
+"}\n"
+"// Returns 1 for a leaf node and 0 for an internal node.\n"
+"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	// A negative value is a skip index (internal node); a value >= 0 is a triangle index (leaf).\n"
+"	if (rootNode->m_escapeIndexOrTriangleIndex < 0)\n"
+"		return 0;\n"
+"	return 1;\n"
+"}\n"
+"// Same as isLeafNode(), for a node that lives in __global memory.\n"
+"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	// A negative value is a skip index (internal node); a value >= 0 is a triangle index (leaf).\n"
+"	if (rootNode->m_escapeIndexOrTriangleIndex < 0)\n"
+"		return 0;\n"
+"	return 1;\n"
+"}\n"
+"	\n"
+"// Returns the escape index of a node, i.e. the negated stored value.\n"
+"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	const int stored = rootNode->m_escapeIndexOrTriangleIndex;\n"
+"	return -stored;\n"
+"}\n"
+"// Same as getEscapeIndex(), for a node that lives in __global memory.\n"
+"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n"
+"{\n"
+"	const int stored = rootNode->m_escapeIndexOrTriangleIndex;\n"
+"	return -stored;\n"
+"}\n"
+"// Header for one BVH subtree: quantized bounds, the index of its root node and its size.\n"
+"typedef struct\n"
+"{\n"
+"	//12 bytes\n"
+"	unsigned short int	m_quantizedAabbMin[3];\n"
+"	unsigned short int	m_quantizedAabbMax[3];\n"
+"	//4 bytes, points to the root of the subtree\n"
+"	int			m_rootNodeIndex;\n"
+"	//4 bytes\n"
+"	int			m_subtreeSize;\n"
+"	// three unused ints of padding\n"
+"	int			m_padding[3];\n"
+"} b3BvhSubtreeInfo;\n"
+"// One child of a compound shape: its position/orientation relative to the parent body and\n"
+"// m_shapeIndex, which processCompoundPairsKernel uses as an index into the collidables array.\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"// Rigid body state as read by the kernels: position, orientation quaternion, velocities,\n"
+"// and m_collidableIdx, which indexes the collidables array.\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"// Convex hull descriptor. The *Offset / m_num* pairs select this hull's range inside the\n"
+"// shared vertex, face and unique-edge arrays (see project() and the findSeparatingAxis* functions).\n"
+"typedef struct  \n"
+"{\n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"// Axis-aligned bounding box. Each corner is a union, so the same 16 bytes can be read\n"
+"// as a float4, as four floats, or as four ints.\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"#ifndef B3_AABB_H\n"
+"#define B3_AABB_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"// NOTE(review): MyTest is not referenced anywhere in the visible part of this kernel source.\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"// Non-C++ branch: map the b3* portability names onto OpenCL built-ins and qualifiers.\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"// b3Assert expands to nothing in this branch.\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3AtomicAdd atomic_add\n"
+"#define b3Fabs fabs\n"
+"#define b3Sqrt native_sqrt\n"
+"#define b3Sin native_sin\n"
+"#define b3Cos native_cos\n"
+"#define B3_STATIC\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	// Dot product of the xyz parts only; w is replaced by zero on both operands.\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		return dot(b3MakeFloat4(v0.xyz,0.f), b3MakeFloat4(v1.xyz,0.f));\n"
+"	}\n"
+"	// Cross product of the xyz parts only; w is replaced by zero on both operands.\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		return cross(b3MakeFloat4(v0.xyz,0.f), b3MakeFloat4(v1.xyz,0.f));\n"
+"	}\n"
+"	#define b3MinFloat4 min\n"
+"	#define b3MaxFloat4 max\n"
+"	#define b3Normalized(a) normalize(a)\n"
+"#endif \n"
+"		\n"
+"// Returns true when |x|, |y| and |z| of v are all no greater than 1e-6 (w is ignored).\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"{\n"
+"	// Single-precision literal: keeps the comparison in float (no double-precision constant)\n"
+"	// and matches the 1e-6f threshold used by IsAlmostZero().\n"
+"	if(b3Fabs(v.x)>1e-6f || b3Fabs(v.y)>1e-6f || b3Fabs(v.z)>1e-6f)\n"
+"		return false;\n"
+"	return true;\n"
+"}\n"
+"// Scans vecArray[0..vecLen) for the entry with the largest 3-component dot product against vec.\n"
+"// Writes that dot product to *dotOut and returns the entry's index. When no entry beats\n"
+"// -B3_INFINITY (e.g. vecLen <= 0) the returned index is 0 and *dotOut is -B3_INFINITY.\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float bestDot = -B3_INFINITY;\n"
+"    int bestIndex = -1;\n"
+"    for( int k = 0; k < vecLen; k++ )\n"
+"    {\n"
+"        const float candidate = b3Dot3F4(vecArray[k],vec);\n"
+"        if( candidate > bestDot )\n"
+"        {\n"
+"            bestDot = candidate;\n"
+"            bestIndex = k;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(bestIndex>=0);\n"
+"    *dotOut = bestDot;\n"
+"    // Fall back to index 0 when nothing was selected.\n"
+"    return (bestIndex<0) ? 0 : bestIndex;\n"
+"}\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	// Quaternions are plain float4 values; the functions below keep the vector part in xyz and the scalar part in w.\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"// Normalizes the xyz part of v with fast_normalize(); the incoming w is discarded (set to zero first).\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"{\n"
+"	const float4 xyzOnly = (float4)(v.x, v.y, v.z, 0.f);\n"
+"	return fast_normalize(xyzOnly);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
+"// Quaternion product a*b (vector part in xyz, scalar part in w).\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	// Vector part: cross(a,b) + a.w*b + b.w*a. The w lane computed here is overwritten below.\n"
+"	b3Quat product = b3Cross3( a, b );\n"
+"	product = product + (a.w*b+b.w*a);\n"
+"	// Scalar part: a.w*b.w - dot(a.xyz, b.xyz).\n"
+"	product.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return product;\n"
+"}\n"
+"// Returns 'in' scaled to unit length over all four components.\n"
+"// If the computed length is not greater than zero, the identity quaternion (0,0,0,1) is returned.\n"
+"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
+"{\n"
+"	const float len = native_sqrt(dot(in, in));\n"
+"	if(!(len > 0.f))\n"
+"	{\n"
+"		return (b3Quat)(0.f, 0.f, 0.f, 1.f);\n"
+"	}\n"
+"	b3Quat unit = in;\n"
+"	unit *= 1.f / len;\n"
+"	return unit;\n"
+"}\n"
+"// Rotates vec by q, computed as q * (vec.xyz, 0) * b3QuatInvert(q).\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	// Treat vec as a quaternion with zero scalar part.\n"
+"	float4 pureVec = vec;\n"
+"	pureVec.w = 0.f;\n"
+"	const b3Quat conj = b3QuatInvert( q );\n"
+"	return b3QuatMul(b3QuatMul(q,pureVec),conj);\n"
+"}\n"
+"// Returns q with its xyz part negated and w unchanged (same result as b3QuatInvert).\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.x, -q.y, -q.z, q.w);\n"
+"}\n"
+"// Returns q with its xyz part negated and w unchanged.\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.x, -q.y, -q.z, q.w);\n"
+"}\n"
+"// Rotates vec by b3QuatInvert(q).\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	const b3Quat inverted = b3QuatInvert( q );\n"
+"	return b3QuatRotate( inverted, vec );\n"
+"}\n"
+"// Rotates point by orientation, then adds translation.\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	const b3Float4 rotated = b3QuatRotate( orientation, point );\n"
+"	return rotated + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"// 3x3 matrix stored as three float4 rows; the functions below set or ignore the w lane of each row.\n"
+"typedef struct\n"
+"{\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"// Row accessor: expands to m.m_row[row].\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"// Builds the 3x3 rotation matrix for quaternion quat (xyz = vector part, w = scalar part).\n"
+"// The w lane of every row is set to zero.\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	// Squares of the vector components, reused on the diagonal.\n"
+"	const b3Float4 sq = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 rot;\n"
+"	rot.m_row[0] = (b3Float4)(1-2*sq.y-2*sq.z,\n"
+"							  2*quat.x*quat.y-2*quat.w*quat.z,\n"
+"							  2*quat.x*quat.z+2*quat.w*quat.y,\n"
+"							  0.f);\n"
+"	rot.m_row[1] = (b3Float4)(2*quat.x*quat.y+2*quat.w*quat.z,\n"
+"							  1-2*sq.x-2*sq.z,\n"
+"							  2*quat.y*quat.z-2*quat.w*quat.x,\n"
+"							  0.f);\n"
+"	rot.m_row[2] = (b3Float4)(2*quat.x*quat.z-2*quat.w*quat.y,\n"
+"							  2*quat.y*quat.z+2*quat.w*quat.x,\n"
+"							  1-2*sq.x-2*sq.y,\n"
+"							  0.f);\n"
+"	return rot;\n"
+"}\n"
+"// Returns a matrix whose rows are the component-wise absolute values of matIn's rows.\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 absMat;\n"
+"	for(int r=0; r<3; r++)\n"
+"	{\n"
+"		absMat.m_row[r] = fabs(matIn.m_row[r]);\n"
+"	}\n"
+"	return absMat;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"// Returns a matrix with every component (including w) set to zero.\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 zero;\n"
+"	for(int r=0; r<3; r++)\n"
+"	{\n"
+"		zero.m_row[r] = (b3Float4)(0.f, 0.f, 0.f, 0.f);\n"
+"	}\n"
+"	return zero;\n"
+"}\n"
+"// Returns the identity matrix; the w lane of every row is zero.\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 ident;\n"
+"	ident.m_row[0] = (b3Float4)(1.f, 0.f, 0.f, 0.f);\n"
+"	ident.m_row[1] = (b3Float4)(0.f, 1.f, 0.f, 0.f);\n"
+"	ident.m_row[2] = (b3Float4)(0.f, 0.f, 1.f, 0.f);\n"
+"	return ident;\n"
+"}\n"
+"// Returns the transpose of the 3x3 part of m; the w lane of every output row is zero.\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	const b3Float4 r0 = m.m_row[0];\n"
+"	const b3Float4 r1 = m.m_row[1];\n"
+"	const b3Float4 r2 = m.m_row[2];\n"
+"	b3Mat3x3 transposed;\n"
+"	// Output row k is built from component k of each input row.\n"
+"	transposed.m_row[0] = (b3Float4)(r0.x, r1.x, r2.x, 0.f);\n"
+"	transposed.m_row[1] = (b3Float4)(r0.y, r1.y, r2.y, 0.f);\n"
+"	transposed.m_row[2] = (b3Float4)(r0.z, r1.z, r2.z, 0.f);\n"
+"	return transposed;\n"
+"}\n"
+"// Matrix product a*b of the 3x3 parts. b is transposed first so each output component is a\n"
+"// row-by-row 3-component dot product; the w lane of every output row is zero.\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	// NOTE(review): the w lanes are cleared outside the loop on purpose per the remark above;\n"
+"	// b3Dot3F4 ignores w anyway, so this looks like a leftover workaround - confirm before removing.\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"// Matrix * column vector: component k of the result is dot3(row k of a, b); w is zero.\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	const float px = b3Dot3F4( a.m_row[0], b );\n"
+"	const float py = b3Dot3F4( a.m_row[1], b );\n"
+"	const float pz = b3Dot3F4( a.m_row[2], b );\n"
+"	return (b3Float4)(px, py, pz, 0.f);\n"
+"}\n"
+"// Row vector * matrix: component k of the result is dot3(a, column k of b); w is zero.\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	// Initialize w explicitly (as mtMul1 does) so the returned vector never carries an indeterminate lane.\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"// Axis-aligned bounding box. Each corner is a union, so the same storage can be read\n"
+"// as four floats, as a b3Float4, or as four ints.\n"
+"typedef struct b3Aabb b3Aabb_t;\n"
+"struct b3Aabb\n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float m_min[4];\n"
+"		b3Float4 m_minVec;\n"
+"		int m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float	m_max[4];\n"
+"		b3Float4 m_maxVec;\n"
+"		int m_signedMaxIndices[4];\n"
+"	};\n"
+"};\n"
+"// Computes the world-space AABB of a local AABB (grown by margin on x, y and z) after applying\n"
+"// the rigid transform (pos, orn). Results are written to *aabbMinOut and *aabbMaxOut.\n"
+"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
+"						b3Float4ConstArg pos,\n"
+"						b3QuatConstArg orn,\n"
+"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
+"{\n"
+"		// Half extents of the local box, padded by the margin.\n"
+"		b3Float4 halfSize = 0.5f*(localAabbMax-localAabbMin);\n"
+"		halfSize+=b3MakeFloat4(margin,margin,margin,0.f);\n"
+"		const b3Float4 boxCenterLocal = 0.5f*(localAabbMax+localAabbMin);\n"
+"		// Absolute value of the rotation matrix gives the world-axis extent of the rotated box.\n"
+"		const b3Mat3x3 rotation = b3QuatGetRotationMatrix(orn);\n"
+"		const b3Mat3x3 absRotation = b3AbsoluteMat3x3(rotation);\n"
+"		const b3Float4 boxCenterWorld = b3TransformPoint(boxCenterLocal,pos,orn);\n"
+"		const float ex = b3Dot3F4(halfSize,b3GetRow(absRotation,0));\n"
+"		const float ey = b3Dot3F4(halfSize,b3GetRow(absRotation,1));\n"
+"		const float ez = b3Dot3F4(halfSize,b3GetRow(absRotation,2));\n"
+"		const b3Float4 worldExtent = b3MakeFloat4(ex, ey, ez, 0.f);\n"
+"		*aabbMinOut = boxCenterWorld-worldExtent;\n"
+"		*aabbMaxOut = boxCenterWorld+worldExtent;\n"
+"}\n"
+"/// conservative test for overlap between two aabbs\n"
+"// Returns false as soon as the boxes are disjoint on x, z or y; touching boxes count as overlapping.\n"
+"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
+"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
+"{\n"
+"	if (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x)\n"
+"		return false;\n"
+"	if (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z)\n"
+"		return false;\n"
+"	if (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y)\n"
+"		return false;\n"
+"	return true;\n"
+"}\n"
+"#endif //B3_AABB_H\n"
+"/*\n"
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"#ifndef B3_INT2_H\n"
+"#define B3_INT2_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"// Non-C++ branch: the b3 two-component integer names map directly onto the built-in int2/uint2 vector types.\n"
+"#define b3UnsignedInt2 uint2\n"
+"#define b3Int2 int2\n"
+"#define b3MakeInt2 (int2)\n"
+"#endif //__cplusplus\n"
+"#endif\n"
+"// One face of a convex hull. m_plane is read as the face normal by the findSeparatingAxis* functions;\n"
+"// m_indexOffset / m_numIndices presumably select the face's vertex indices (not used in this part of the file).\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"#define make_float4 (float4)\n"
+"// Cross product of a and b via the built-in cross(); unlike b3Cross3 the inputs are passed through unchanged.\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	const float4 product = cross(a,b);\n"
+"	return product;\n"
+"}\n"
+"// Dot product of the xyz parts of a and b; w is replaced by zero on both operands.\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	return dot(make_float4(a.xyz,0.f), make_float4(b.xyz,0.f));\n"
+"}\n"
+"// Normalizes the xyz part of v with fast_normalize(); the incoming w is discarded (set to zero first).\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	const float4 xyzOnly = make_float4(v.x, v.y, v.z, 0.f);\n"
+"	return fast_normalize(xyzOnly);\n"
+"}\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"typedef float4 Quaternion;\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"// Quaternion product a*b (vector part in xyz, scalar part in w).\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	// Vector part: cross(a,b) + a.w*b + b.w*a. The w lane computed here is overwritten below.\n"
+"	Quaternion product = cross3( a, b );\n"
+"	product = product + (a.w*b+b.w*a);\n"
+"	// Scalar part: a.w*b.w - dot(a.xyz, b.xyz).\n"
+"	product.w = a.w*b.w - dot3F4(a, b);\n"
+"	return product;\n"
+"}\n"
+"// Returns 'in' scaled to unit length over all four components (x, y, z and w).\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	// Do not route through fastNormalize4(): it zeroes w before normalizing, which would\n"
+"	// throw away the scalar part of the quaternion and return a different rotation.\n"
+"	return fast_normalize(in);\n"
+"}\n"
+"// Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q).\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	// Treat vec as a quaternion with zero scalar part.\n"
+"	float4 pureVec = vec;\n"
+"	pureVec.w = 0.f;\n"
+"	const Quaternion conj = qtInvert( q );\n"
+"	return qtMul(qtMul(q,pureVec),conj);\n"
+"}\n"
+"// Returns q with its xyz part negated and w unchanged.\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.x, -q.y, -q.z, q.w);\n"
+"}\n"
+"// Rotates vec by qtInvert(q).\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	const Quaternion inverted = qtInvert( q );\n"
+"	return qtRotate( inverted, vec );\n"
+"}\n"
+"// Rotates *p by *orientation, then adds *translation.\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	const float4 rotated = qtRotate( *orientation, *p );\n"
+"	return rotated + (*translation);\n"
+"}\n"
+"// Normalizes the xyz part of a (w is dropped) using fastNormalize4().\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	return fastNormalize4( make_float4(a.x, a.y, a.z, 0.f) );\n"
+"}\n"
+"// Projects the hull's vertices onto the world-space axis *dir and writes the resulting interval\n"
+"// to *min / *max. The axis is rotated into the hull's local frame once, and dot(pos,*dir) is\n"
+"// added afterwards to shift the interval into world space. hull and vertices are non-__global pointers.\n"
+"inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, const float4* vertices, float* min, float* max)\n"
+"{\n"
+"	*min = FLT_MAX;\n"
+"	*max = -FLT_MAX;\n"
+"	const int vertexCount = hull->m_numVertices;\n"
+"	const float4 axisLocal = qtInvRotate(orn,*dir);\n"
+"	const float shift = dot(pos,*dir);\n"
+"	for(int v=0;v<vertexCount;v++)\n"
+"	{\n"
+"		const float proj = dot(vertices[hull->m_vertexOffset+v],axisLocal);\n"
+"		if(proj < *min)\n"
+"			*min = proj;\n"
+"		if(proj > *max)\n"
+"			*max = proj;\n"
+"	}\n"
+"	// Keep the interval ordered (only differs when no vertex updated the initial values).\n"
+"	if(*min>*max)\n"
+"	{\n"
+"		const float swapped = *min;\n"
+"		*min = *max;\n"
+"		*max = swapped;\n"
+"	}\n"
+"	*min += shift;\n"
+"	*max += shift;\n"
+"}\n"
+"// Projects the hull's vertices onto the world-space axis *dir and writes the resulting interval\n"
+"// to *min / *max. The axis is rotated into the hull's local frame once, and dot(pos,*dir) is\n"
+"// added afterwards to shift the interval into world space. hull and vertices are __global pointers.\n"
+"inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"{\n"
+"	*min = FLT_MAX;\n"
+"	*max = -FLT_MAX;\n"
+"	const int vertexCount = hull->m_numVertices;\n"
+"	const float4 axisLocal = qtInvRotate(orn,*dir);\n"
+"	const float shift = dot(pos,*dir);\n"
+"	for(int v=0;v<vertexCount;v++)\n"
+"	{\n"
+"		const float proj = dot(vertices[hull->m_vertexOffset+v],axisLocal);\n"
+"		if(proj < *min)\n"
+"			*min = proj;\n"
+"		if(proj > *max)\n"
+"			*max = proj;\n"
+"	}\n"
+"	// Keep the interval ordered (only differs when no vertex updated the initial values).\n"
+"	if(*min>*max)\n"
+"	{\n"
+"		const float swapped = *min;\n"
+"		*min = *max;\n"
+"		*max = swapped;\n"
+"	}\n"
+"	*min += shift;\n"
+"	*max += shift;\n"
+"}\n"
+"// Tests one candidate axis for a non-__global hull A against a __global hull B.\n"
+"// Returns false (leaving *depth untouched) when the projected intervals are disjoint;\n"
+"// otherwise stores the smaller of the two interval overlaps in *depth and returns true.\n"
+"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n"
+"{\n"
+"	float loA,hiA;\n"
+"	float loB,hiB;\n"
+"	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &loA, &hiA);\n"
+"	project(hullB,posB,ornB, sep_axis,verticesB, &loB, &hiB);\n"
+"	if(hiA<loB || hiB<loA)\n"
+"		return false;\n"
+"	const float overlapAB = hiA - loB;\n"
+"	const float overlapBA = hiB - loA;\n"
+"	if(overlapAB<overlapBA)\n"
+"		*depth = overlapAB;\n"
+"	else\n"
+"		*depth = overlapBA;\n"
+"	return true;\n"
+"}\n"
+"// Returns true when none of |x|, |y|, |z| of v exceeds 1e-6f (w is ignored).\n"
+"inline bool IsAlmostZero(const float4 v)\n"
+"{\n"
+"	const bool xSmall = !(fabs(v.x)>1e-6f);\n"
+"	const bool ySmall = !(fabs(v.y)>1e-6f);\n"
+"	const bool zSmall = !(fabs(v.z)>1e-6f);\n"
+"	return xSmall && ySmall && zSmall;\n"
+"}\n"
+"// Face-normal SAT pass using the faces of hull A, where hull A's data is in non-__global memory\n"
+"// and hull B's data is in __global memory.\n"
+"// Each face normal of A is rotated to world space, flipped to point along DeltaC2, and tested with\n"
+"// TestSepAxisLocalA. Returns false as soon as an axis separates the hulls. Otherwise returns true,\n"
+"// with *dmin / *sep updated whenever an axis has a smaller overlap than the incoming *dmin.\n"
+"// uniqueEdgesA/B, indicesA/B, facesB and verticesB's face data are not read by this function.\n"
+"bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	// Clear w so the 4-component dot(pos,dir) inside the projection helpers only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	// curPlaneTests is only incremented, never read.\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS*=-1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n"
+"				return false;\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// Face-normal SAT pass using the faces of hull A, where hull A's data is in __global memory\n"
+"// and hull B's data is in non-__global memory.\n"
+"// Each face normal of A is rotated to world space, flipped to point along DeltaC2, and tested with\n"
+"// TestSepAxisLocalA called with the two hulls swapped (the non-__global hull B goes first).\n"
+"// Returns false as soon as an axis separates the hulls. Otherwise returns true, with *dmin / *sep\n"
+"// updated whenever an axis has a smaller overlap than the incoming *dmin.\n"
+"bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* verticesA, \n"
+"	__global const float4* uniqueEdgesA, \n"
+"	__global const btGpuFace* facesA,\n"
+"	__global const int*  indicesA,\n"
+"	const float4* verticesB,\n"
+"	const float4* uniqueEdgesB, \n"
+"	const btGpuFace* facesB,\n"
+"	const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// Clear w so the 4-component dot(pos,dir) inside the projection helpers only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	// curPlaneTests is only incremented, never read.\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS *= -1.f;\n"
+"			curPlaneTests++;\n"
+"			float d;\n"
+"			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n"
+"				return false;\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// Edge-edge SAT pass for a hull A held in non-__global memory against a hull B in __global memory.\n"
+"// For every pair (unique edge of A, unique edge of B) the world-space cross product is used as a\n"
+"// candidate axis (near-zero cross products are skipped). Returns false as soon as one axis separates\n"
+"// the hulls, in line with findSeparatingAxisEdgeEdge. Otherwise returns true, with *dmin / *sep\n"
+"// updated whenever an axis has a smaller overlap than the incoming *dmin.\n"
+"// facesA/B and indicesA/B are accepted for signature compatibility but not read.\n"
+"bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"		float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// Clear w so the 4-component dot(pos,dir) inside the projection helpers only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			if(!IsAlmostZero(crossje))\n"
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n"
+"					crossje *= -1.f;\n"
+"				float Min0,Max0;\n"
+"				float Min1,Max1;\n"
+"				projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n"
+"				project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n"
+"				\n"
+"				// Disjoint projections: crossje is a separating axis. The previous code recorded this\n"
+"				// in a local flag that was immediately overwritten and never read, so the\n"
+"				// separation was silently ignored.\n"
+"				if(Max0<Min1 || Max1<Min0)\n"
+"					return false;\n"
+"				\n"
+"				float d0 = Max0 - Min1;\n"
+"				float d1 = Max1 - Min0;\n"
+"				float dist = d0<d1 ? d0:d1;\n"
+"				\n"
+"				if(dist<*dmin)\n"
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// Tests one candidate axis for two hulls whose vertices live in the same __global array.\n"
+"// Returns false (leaving *depth untouched) when the projected intervals are disjoint;\n"
+"// otherwise stores the smaller of the two interval overlaps in *depth and returns true.\n"
+"inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, __global const float4* vertices,float* depth)\n"
+"{\n"
+"	float loA,hiA;\n"
+"	float loB,hiB;\n"
+"	project(hullA,posA,ornA,sep_axis,vertices, &loA, &hiA);\n"
+"	project(hullB,posB,ornB, sep_axis,vertices, &loB, &hiB);\n"
+"	if(hiA<loB || hiB<loA)\n"
+"		return false;\n"
+"	const float overlapAB = hiA - loB;\n"
+"	const float overlapBA = hiB - loA;\n"
+"	if(overlapAB<overlapBA)\n"
+"		*depth = overlapAB;\n"
+"	else\n"
+"		*depth = overlapBA;\n"
+"	return true;\n"
+"}\n"
+"// Face-normal SAT pass using the faces of hull A; both hulls read from the same __global arrays.\n"
+"// Each face normal of A is rotated to world space, flipped to point along DeltaC2, and tested with\n"
+"// TestSepAxis. Returns false as soon as an axis separates the hulls. Otherwise returns true, with\n"
+"// *dmin / *sep updated whenever an axis has a smaller overlap than the incoming *dmin.\n"
+"// uniqueEdges and indices are not read by this function.\n"
+"bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices, \n"
+"	__global const float4* uniqueEdges, \n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int*  indices,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	// Clear w so the 4-component dot(pos,dir) inside project() only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	\n"
+"	// curPlaneTests is only incremented, never read.\n"
+"	int curPlaneTests=0;\n"
+"	{\n"
+"		int numFacesA = hullA->m_numFaces;\n"
+"		// Test normals from hullA\n"
+"		for(int i=0;i<numFacesA;i++)\n"
+"		{\n"
+"			const float4 normal = faces[hullA->m_faceOffset+i].m_plane;\n"
+"			float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"	\n"
+"			if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"				faceANormalWS*=-1.f;\n"
+"				\n"
+"			curPlaneTests++;\n"
+"	\n"
+"			float d;\n"
+"			if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n"
+"				return false;\n"
+"	\n"
+"			if(d<*dmin)\n"
+"			{\n"
+"				*dmin = d;\n"
+"				*sep = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"		// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"		if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"		{\n"
+"			*sep = -(*sep);\n"
+"		}\n"
+"	\n"
+"	return true;\n"
+"}\n"
+"// SAT pass over a caller-supplied list of candidate directions (unitSphereDirections[0..numUnitSphereDirections)).\n"
+"// Returns false as soon as a direction separates the hulls. Otherwise returns true, with\n"
+"// *dmin / *sep updated whenever a direction has a smaller overlap than the incoming *dmin.\n"
+"bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices,\n"
+"	__global const float4* unitSphereDirections,\n"
+"	int numUnitSphereDirections,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	// Clear w so the 4-component dot(pos,dir) inside project() only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	// curPlaneTests and curEdgeEdge are never read.\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test unit sphere directions\n"
+"	for (int i=0;i<numUnitSphereDirections;i++)\n"
+"	{\n"
+"		float4 crossje;\n"
+"		crossje = unitSphereDirections[i];	\n"
+"		// NOTE(review): this flips when the dot product is positive, the opposite of the other\n"
+"		// findSeparatingAxis* variants; the final orientation is fixed up after the loop either way.\n"
+"		if (dot3F4(DeltaC2,crossje)>0)\n"
+"			crossje *= -1.f;\n"
+"		{\n"
+"			float dist;\n"
+"			// 'result' is written but never read.\n"
+"			bool result = true;\n"
+"			float Min0,Max0;\n"
+"			float Min1,Max1;\n"
+"			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"		\n"
+"			if(Max0<Min1 || Max1<Min0)\n"
+"				return false;\n"
+"		\n"
+"			float d0 = Max0 - Min1;\n"
+"			float d1 = Max1 - Min0;\n"
+"			dist = d0<d1 ? d0:d1;\n"
+"			result = true;\n"
+"	\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// Edge-edge SAT pass; both hulls read from the same __global arrays.\n"
+"// For every pair (unique edge of A, unique edge of B) the world-space cross product is used as a\n"
+"// candidate axis (near-zero cross products are skipped). Returns false as soon as an axis separates\n"
+"// the hulls. Otherwise returns true, with *dmin / *sep updated whenever an axis has a smaller\n"
+"// overlap than the incoming *dmin. faces and indices are not read by this function.\n"
+"bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices, \n"
+"	__global const float4* uniqueEdges, \n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int*  indices,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	// Clear w so the 4-component dot(pos,dir) inside project() only sees xyz.\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	// curPlaneTests is never read; curEdgeEdge is only incremented.\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			curEdgeEdge++;\n"
+"			if(!IsAlmostZero(crossje))\n"
+"			{\n"
+"				crossje = normalize3(crossje);\n"
+"				if (dot3F4(DeltaC2,crossje)<0)\n"
+"					crossje*=-1.f;\n"
+"					\n"
+"				float dist;\n"
+"				// 'result' is written but never read.\n"
+"				bool result = true;\n"
+"				{\n"
+"					float Min0,Max0;\n"
+"					float Min1,Max1;\n"
+"					project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"					project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"				\n"
+"					if(Max0<Min1 || Max1<Min0)\n"
+"						return false;\n"
+"				\n"
+"					float d0 = Max0 - Min1;\n"
+"					float d1 = Max1 - Min0;\n"
+"					dist = d0<d1 ? d0:d1;\n"
+"					result = true;\n"
+"				}\n"
+"				\n"
+"				if(dist<*dmin)\n"
+"				{\n"
+"					*dmin = dist;\n"
+"					*sep = crossje;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	// Orient the reported axis so that dot(DeltaC2, *sep) is not negative.\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"// work-in-progress\n"
+"__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"  // [C++-side note, outside the kernel text] One work-item per compound pair: runs the three hull-vs-hull axis tests and records the outcome. gpuCompoundPairs: x,y = body indices, z,w = child shape indices.
+"																					__global const BodyData* rigidBodies, \n"  // per-body data; m_pos, m_quat and m_collidableIdx are read here
+"																					__global const btCollidableGpu* collidables,\n"  // m_shapeIndex and m_shapeType are read here
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"  // hull records; m_numFaces and m_localCenter are read, entries are passed to the axis-test helpers
+"																					__global const float4* vertices,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const float4* uniqueEdges,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const btGpuFace* faces,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const int* indices,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global btAabbCL* aabbs,\n"  // not referenced in this kernel body
+"																					__global const btGpuChildShape* gpuChildShapes,\n"  // child records; m_shapeIndex, m_childPosition and m_childOrientation are read
+"																					__global volatile float4* gpuCompoundSepNormalsOut,\n"  // out: sepNormal, written only when all three tests return true
+"																					__global volatile int* gpuHasCompoundSepNormalsOut,\n"  // out: 0 by default, 1 when all three tests return true
+"																					int numCompoundPairs\n"  // work-items with an id at or above this value do nothing
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n"  // the global work-item id is used as the pair index
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"		int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"		int childShapeIndexA = gpuCompoundPairs[i].z;\n"  // negative means body A contributes its own collidable
+"		int childShapeIndexB = gpuCompoundPairs[i].w;\n"  // negative means body B contributes its own collidable
+"		\n"
+"		int collidableIndexA = -1;\n"
+"		int collidableIndexB = -1;\n"
+"		\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		\n"
+"		float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							\n"
+"		if (childShapeIndexA >= 0)\n"  // non-negative child index: fold the child transform into the body transform
+"		{\n"
+"			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"  // the child record names the collidable to use
+"			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"			float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"  // rotate the child offset by the body orientation, then add the body position
+"			float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"			posA = newPosA;\n"
+"			ornA = newOrnA;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"  // no child: use the collidable of the body itself
+"		}\n"
+"		\n"
+"		if (childShapeIndexB>=0)\n"
+"		{\n"
+"			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"  // NOTE(review): side A uses qtRotate(...)+pos while side B calls transform(); presumably equivalent - confirm
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"		}\n"
+"	\n"
+"		gpuHasCompoundSepNormalsOut[i] = 0;\n"  // clear the output flag before the early return below
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	\n"
+"		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
+"		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
+"	\n"
+"		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n"  // only hull-vs-hull is handled; anything else leaves the flag at 0
+"		{\n"
+"			return;\n"
+"		}\n"
+"		int hasSeparatingAxis = 5;\n"  // local status; assigned below but never read or written out
+"							\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // not used afterwards in this kernel
+"		float dmin = FLT_MAX;\n"  // passed by pointer to all three helpers
+"		posA.w = 0.f;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"  // transformed local center of A minus that of B
+"		float4 sepNormal = make_float4(1,0,0,0);\n"  // initial value; the helpers receive a pointer to it
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"  // test 1: helper called with A first, B second
+"		hasSeparatingAxis = 4;\n"
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"  // test 2: same helper with the roles of A and B swapped; DeltaC2 is passed unchanged
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else//(!sepB)\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"  // test 3: edge-edge helper
+"				if (sepEE)\n"
+"				{\n"
+"						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n"  // the normal is stored as returned by the helpers
+"						gpuHasCompoundSepNormalsOut[i] = 1;\n"  // the only place the output flag becomes 1
+"				}//sepEE\n"
+"			}//(!sepB)\n"
+"		}//(!sepA)\n"
+"		\n"
+"		\n"
+"	}\n"
+"		\n"
+"}\n"
+"inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n"  // [C++-side note, outside the kernel text] Turns three quantized unsigned short components back into floats.
+"{\n"
+"		b3Float4 vecOut;\n"
+"		vecOut = b3MakeFloat4(\n"
+"			(float)(vecIn[0]) / (quantization.x),\n"  // each component: cast to float, then divide by the matching quantization component
+"			(float)(vecIn[1]) / (quantization.y),\n"
+"			(float)(vecIn[2]) / (quantization.z),\n"
+"			0.f);\n"  // the fourth component starts at 0
+"		vecOut += bvhAabbMin;\n"  // then the whole vector is offset by bvhAabbMin
+"		return vecOut;\n"
+"}\n"
+"inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n"  // [C++-side note, outside the kernel text] Same arithmetic as MyUnQuantize, but vecIn carries the __global address-space qualifier.
+"{\n"
+"		b3Float4 vecOut;\n"
+"		vecOut = b3MakeFloat4(\n"
+"			(float)(vecIn[0]) / (quantization.x),\n"  // each component: cast to float, then divide by the matching quantization component
+"			(float)(vecIn[1]) / (quantization.y),\n"
+"			(float)(vecIn[2]) / (quantization.z),\n"
+"			0.f);\n"  // the fourth component starts at 0
+"		vecOut += bvhAabbMin;\n"  // then the whole vector is offset by bvhAabbMin
+"		return vecOut;\n"
+"}\n"
+"// work-in-progress\n"
+"__kernel void   findCompoundPairsKernel( __global const int4* pairs, \n"  // [C++-side note, outside the kernel text] One work-item per body pair: when a compound shape is involved, appends (bodyA, bodyB, childA, childB) candidates to gpuCompoundPairsOut. pairs: x,y = body indices.
+"	__global const BodyData* rigidBodies, \n"  // per-body data; m_collidableIdx, m_invMass, m_pos and m_quat are read here
+"	__global const btCollidableGpu* collidables,\n"  // m_shapeIndex, m_shapeType, m_compoundBvhIndex and m_numChildShapes are read here
+"	__global const ConvexPolyhedronCL* convexShapes, \n"  // only read for values that are not used afterwards (see review notes below)
+"	__global const float4* vertices,\n"  // not referenced in this kernel body
+"	__global const float4* uniqueEdges,\n"  // not referenced in this kernel body
+"	__global const btGpuFace* faces,\n"  // not referenced in this kernel body
+"	__global const int* indices,\n"  // not referenced in this kernel body
+"	__global b3Aabb_t* aabbLocalSpace,\n"  // AABBs looked up by shape index and then transformed by the composed child transform
+"	__global const btGpuChildShape* gpuChildShapes,\n"  // child records; m_shapeIndex, m_childPosition and m_childOrientation are read
+"	__global volatile int4* gpuCompoundPairsOut,\n"  // out: int4(bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB); -1 marks a side without a child
+"	__global volatile int* numCompoundPairsOut,\n"  // out: atomically incremented per candidate, also when the capacity check fails
+"	__global const b3BvhSubtreeInfo* subtrees,\n"  // subtree headers; each BVH starts at its m_subTreeOffset
+"	__global const b3QuantizedBvhNode* quantizedNodes,\n"  // quantized nodes; each BVH starts at its m_nodeOffset
+"	__global const b3BvhInfo* bvhInfos,\n"  // per-BVH quantization, AABB min, subtree count and offsets
+"	int numPairs,\n"  // work-items with an id at or above this value do nothing
+"	int maxNumCompoundPairsCapacity\n"  // slots at or above this index are never written
+"	)\n"
+"{\n"
+"	int i = get_global_id(0);\n"  // the global work-item id is used as the pair index
+"	if (i<numPairs)\n"
+"	{\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"  // both inverse masses are zero: emit nothing
+"		{\n"
+"			return;\n"
+"		}\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"  // case 1: both shapes are compounds - pair up children by walking the two BVHs
+"		{\n"
+"			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;\n"
+"			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;\n"
+"			int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;\n"
+"			int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;\n"
+"			int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;\n"
+"			int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;\n"
+"			\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			\n"
+"			for (int p=0;p<numSubTreesA;p++)\n"  // every subtree of A is tested against every subtree of B
+"			{\n"
+"				b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];\n"
+"				//bvhInfos[bvhA].m_quantization\n"
+"				b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"  // subtree bounds are unquantized with the quantization and AABB min of BVH A
+"				b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"				b3Float4 aabbAMinOut,aabbAMaxOut;\n"
+"				float margin=0.f;\n"
+"				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n"  // transform the subtree box by the pose of body A, margin 0
+"				\n"
+"				for (int q=0;q<numSubTreesB;q++)\n"
+"				{\n"
+"					b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];\n"
+"					b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"					b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"					b3Float4 aabbBMinOut,aabbBMaxOut;\n"
+"					float margin=0.f;\n"
+"					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n"
+"					\n"
+"					\n"
+"					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n"
+"					if (aabbOverlap)\n"  // subtree pairs whose transformed boxes do not overlap are skipped entirely
+"					{\n"
+"						\n"
+"						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;\n"
+"						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;\n"  // endNodeIndexA is not used afterwards
+"						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;\n"
+"						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;\n"  // endNodeIndexB is not used afterwards
+"						b3Int2 nodeStack[B3_MAX_STACK_DEPTH];\n"  // explicit traversal stack of (node in A, node in B) index pairs
+"						b3Int2 node0;\n"
+"						node0.x = startNodeIndexA;\n"
+"						node0.y = startNodeIndexB;\n"
+"						int maxStackDepth = B3_MAX_STACK_DEPTH;\n"
+"						int depth=0;\n"
+"						nodeStack[depth++]=node0;\n"  // start from the two subtree roots
+"						do\n"
+"						{\n"
+"							b3Int2 node = nodeStack[--depth];\n"  // pop the next node pair
+"							b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"							b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n"
+"							b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"							b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n"
+"							float margin=0.f;\n"
+"							b3Float4 aabbAMinOut,aabbAMaxOut;\n"  // these shadow the subtree-level boxes declared in the enclosing loops
+"							b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n"
+"							b3Float4 aabbBMinOut,aabbBMaxOut;\n"
+"							b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n"
+"							\n"
+"							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n"
+"							if (nodeOverlap)\n"
+"							{\n"
+"								bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);\n"
+"								bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);\n"
+"								bool isInternalA = !isLeafA;\n"
+"								bool isInternalB = !isLeafB;\n"
+"								//fail, even though it might hit two leaf nodes\n"
+"								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))\n"  // up to four pushes follow, so room for four is required unless both nodes are leaves
+"								{\n"
+"									//printf(\"Error: traversal exceeded maxStackDepth\");\n"
+"									continue;\n"  // drops this node pair without visiting its children; continue jumps to the while (depth) test
+"								}\n"
+"								if(isInternalA)\n"
+"								{\n"
+"									int nodeAleftChild = node.x+1;\n"  // the left child is taken to be the slot right after its parent
+"									bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);\n"
+"									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);\n"  // right child: next slot if the left child is a leaf, otherwise left child index plus its escape index
+"									if(isInternalB)\n"
+"									{					\n"
+"										int nodeBleftChild = node.y+1;\n"
+"										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n"
+"										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);\n"  // both internal: push all four child combinations
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);\n"
+"									}\n"
+"									else\n"
+"									{\n"
+"										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);\n"  // only A is internal: descend A, keep the B leaf
+"										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);\n"
+"									}\n"
+"								}\n"
+"								else\n"
+"								{\n"
+"									if(isInternalB)\n"
+"									{\n"
+"										int nodeBleftChild = node.y+1;\n"
+"										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n"
+"										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n"
+"										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);\n"  // only B is internal: descend B, keep the A leaf
+"										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);\n"
+"									}\n"
+"									else\n"
+"									{\n"
+"										int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"  // both leaves: reserve an output slot; the counter grows even if the slot is out of range
+"										if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"  // write only when the reserved slot is inside the output buffer
+"										{\n"
+"											int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);\n"  // the value from getTriangleIndexGlobal is stored as the child shape index
+"											int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);\n"
+"											gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n"
+"										}\n"
+"									}\n"
+"								}\n"
+"							}\n"
+"						} while (depth);\n"  // loop until the stack is empty
+"					}\n"
+"				}\n"
+"			}\n"
+"			\n"
+"			return;\n"  // the both-compound case ends here unconditionally
+"		}\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"  // case 2: at least one side is a compound; the both-compound case already returned above
+"		{\n"
+"			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n"  // case 2a: A is the compound
+"			{\n"
+"				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n"
+"				for (int c=0;c<numChildrenA;c++)\n"
+"				{\n"
+"					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n"  // children are consecutive entries starting at the m_shapeIndex of the collidable
+"					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"  // collidable index of this child
+"					float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"					float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"					float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"					float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n"  // shadows the shapeIndexA declared at kernel scope
+"					b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];\n"
+"					float margin = 0.f;\n"
+"					\n"
+"					b3Float4 aabbAMinWS;\n"
+"					b3Float4 aabbAMaxWS;\n"
+"					\n"
+"					b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,\n"  // transform the child AABB by the composed child pose
+"						newPosA,\n"
+"						newOrnA,\n"
+"						&aabbAMinWS,&aabbAMaxWS);\n"
+"						\n"
+"					\n"
+"					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"  // NOTE(review): looks unreachable - when both are compounds the kernel already returned in case 1
+"					{\n"
+"						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"						for (int b=0;b<numChildrenB;b++)\n"
+"						{\n"
+"							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"							float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"							float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"							float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"							float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"							b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];\n"
+"							\n"
+"							b3Float4 aabbBMinWS;\n"
+"							b3Float4 aabbBMaxWS;\n"
+"							\n"
+"							b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,\n"
+"								newPosB,\n"
+"								newOrnB,\n"
+"								&aabbBMinWS,&aabbBMaxWS);\n"
+"								\n"
+"								\n"
+"							\n"
+"							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);\n"
+"							if (aabbOverlap)\n"
+"							{\n"
+"								int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // NOTE(review): nothing computed from here through DeltaC2 is read afterwards in this scope
+"								float dmin = FLT_MAX;\n"
+"								float4 posA = newPosA;\n"
+"								posA.w = 0.f;\n"
+"								float4 posB = newPosB;\n"
+"								posB.w = 0.f;\n"
+"								float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"								float4 ornA = newOrnA;\n"
+"								float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"								float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"								float4 ornB =newOrnB;\n"
+"								float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"								const float4 DeltaC2 = c0 - c1;\n"
+"								{//\n"
+"									int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"									if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"									{\n"
+"										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n"
+"									}\n"
+"								}//\n"
+"							}//fi(1)\n"
+"						} //for (int b=0\n"
+"					}//if (collidables[collidableIndexB].\n"
+"					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"  // B is not a compound: every child of A is emitted against body B; no AABB test is made in this branch
+"					{\n"
+"						if (1)\n"
+"						{\n"
+"							int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // NOTE(review): nothing computed from here through DeltaC2 is read afterwards in this scope
+"							float dmin = FLT_MAX;\n"
+"							float4 posA = newPosA;\n"
+"							posA.w = 0.f;\n"
+"							float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"							posB.w = 0.f;\n"
+"							float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"							float4 ornA = newOrnA;\n"
+"							float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"							float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"  // shapeIndexB here is the kernel-scope value taken from the collidable of body B
+"							float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"							float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"							const float4 DeltaC2 = c0 - c1;\n"
+"							{\n"
+"								int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"								if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"								{\n"
+"									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n"  // -1 marks B as having no child shape
+"								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"							}//\n"
+"						}//fi (1)\n"
+"					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"				}//for (int b=0;b<numChildrenB;b++)	\n"  // this brace actually closes the loop over children of A (variable c)
+"				return;\n"
+"			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"  // this brace actually closes the test on the shape type of A
+"			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) \n"  // case 2b: B is the compound and A is not a concave trimesh; a compound A already returned above
+"				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
+"			{\n"
+"				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"				for (int b=0;b<numChildrenB;b++)\n"
+"				{\n"
+"					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
+"					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"					float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"					float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"					float4 newPosB = qtRotate(ornB,childPosB)+posB;\n"
+"					float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"  // shadows the shapeIndexB declared at kernel scope
+"					//////////////////////////////////////\n"
+"					if (1)\n"  // every child of B is emitted against body A; no AABB test is made in this branch
+"					{\n"
+"						int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // NOTE(review): nothing computed from here through DeltaC2 is read afterwards in this scope
+"						float dmin = FLT_MAX;\n"
+"						float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"						posA.w = 0.f;\n"
+"						float4 posB = newPosB;\n"
+"						posB.w = 0.f;\n"
+"						float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"						float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"						float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"						float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"						float4 ornB =newOrnB;\n"
+"						float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"						const float4 DeltaC2 = c0 - c1;\n"
+"						{//\n"
+"							int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"							if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"							{\n"
+"								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n"  // -1 marks A as having no child shape
+"							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"						}//\n"
+"					}//fi (1)	\n"
+"				}//for (int b=0;b<numChildrenB;b++)\n"
+"				return;\n"
+"			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"			return;\n"
+"		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
+"	}//i<numPairs\n"
+"}\n"
+"// work-in-progress\n"
+"__kernel void   findSeparatingAxisKernel( __global const int4* pairs, \n"  // [C++-side note, outside the kernel text] One work-item per pair: for hull-vs-hull pairs runs the three axis tests and writes hasSeparatingAxis[i] as 0 or 1. pairs: x,y = body indices.
+"																					__global const BodyData* rigidBodies, \n"  // per-body data; m_collidableIdx, m_invMass, m_pos and m_quat are read here
+"																					__global const btCollidableGpu* collidables,\n"  // m_shapeIndex and m_shapeType are read here
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"  // hull records; m_numFaces and m_localCenter are read, entries are passed to the axis-test helpers
+"																					__global const float4* vertices,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const float4* uniqueEdges,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const btGpuFace* faces,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global const int* indices,\n"  // forwarded unchanged to the axis-test helpers
+"																					__global btAabbCL* aabbs,\n"  // not referenced in this kernel body
+"																					__global volatile float4* separatingNormals,\n"  // out: written only when all three tests return true
+"																					__global volatile int* hasSeparatingAxis,\n"  // out: 1 when all three tests return true, otherwise 0
+"																					int numPairs\n"  // work-items with an id at or above this value do nothing
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n"  // the global work-item id is used as the pair index
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"	\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"  // both inverse masses are zero: report 0 and stop
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"  // only hull-vs-hull is handled; anything else reports 0
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"			\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))\n"  // NOTE(review): looks unreachable - the previous check returned for every A that is not SHAPE_CONVEX_HULL (assuming the two constants differ)
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"			return;\n"
+"		}\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // not used afterwards in this kernel
+"		float dmin = FLT_MAX;\n"  // passed by pointer to all three helpers
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"  // transformed local center of A minus that of B
+"		float4 sepNormal;\n"  // not initialized here; a pointer to it is handed to the helpers
+"		\n"
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"  // test 1: helper called with A first, B second
+"																								posB,ornB,\n"
+"																								DeltaC2,\n"
+"																								vertices,uniqueEdges,faces,\n"
+"																								indices,&sepNormal,&dmin);\n"
+"		hasSeparatingAxis[i] = 4;\n"  // provisional value; every path below overwrites it with 0 or 1
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n"  // test 2: same helper with the roles of A and B swapped; DeltaC2 is passed unchanged
+"																									posA,ornA,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis[i] = 0;\n"
+"			} else\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"  // test 3: edge-edge helper
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis[i] = 1;\n"  // all three tests returned true
+"					separatingNormals[i] = sepNormal;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+"}\n"
+"__kernel void   findSeparatingAxisVertexFaceKernel( __global const int4* pairs, \n"  // [C++-side note, outside the kernel text] One work-item per pair: runs only the two findSeparatingAxis calls (no edge-edge test) and keeps dmin in dmins[i]. pairs: x,y = body indices.
+"																					__global const BodyData* rigidBodies, \n"  // per-body data; m_collidableIdx, m_invMass, m_pos and m_quat are read here
+"																					__global const btCollidableGpu* collidables,\n"  // m_shapeIndex and m_shapeType are read here
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"  // hull records; m_numFaces and m_localCenter are read, entries are passed to findSeparatingAxis
+"																					__global const float4* vertices,\n"  // forwarded unchanged to findSeparatingAxis
+"																					__global const float4* uniqueEdges,\n"  // forwarded unchanged to findSeparatingAxis
+"																					__global const btGpuFace* faces,\n"  // forwarded unchanged to findSeparatingAxis
+"																					__global const int* indices,\n"  // forwarded unchanged to findSeparatingAxis
+"																					__global btAabbCL* aabbs,\n"  // not referenced in this kernel body
+"																					__global volatile float4* separatingNormals,\n"  // out: written only when both tests return true
+"																					__global volatile int* hasSeparatingAxis,\n"  // out: 1 when both tests return true, otherwise 0
+"																					__global  float* dmins,\n"  // out: FLT_MAX once the early exits are passed, then dmin when both tests return true
+"																					int numPairs\n"  // work-items with an id at or above this value do nothing
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n"  // the global work-item id is used as the pair index
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"	\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	\n"
+"		hasSeparatingAxis[i] = 0;	\n"  // cleared up front, so the early returns below leave 0
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"  // both inverse masses are zero: stop with the flag at 0
+"		{\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"  // only hull-vs-hull is handled; anything else stops with the flag at 0
+"		{\n"
+"			return;\n"
+"		}\n"
+"			\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // not used afterwards in this kernel
+"		float dmin = FLT_MAX;\n"
+"		dmins[i] = dmin;\n"  // stored value stays FLT_MAX unless both tests return true
+"		\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"  // transformed local center of A minus that of B
+"		float4 sepNormal;\n"  // not initialized here; a pointer to it is handed to findSeparatingAxis
+"		\n"
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"  // test 1: helper called with A first, B second
+"																								posB,ornB,\n"
+"																								DeltaC2,\n"
+"																								vertices,uniqueEdges,faces,\n"
+"																								indices,&sepNormal,&dmin);\n"
+"		hasSeparatingAxis[i] = 4;\n"  // provisional value; every path below now overwrites it with 0 or 1
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis[i] = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n"  // test 2: same helper with the roles of A and B swapped; DeltaC2 is passed unchanged
+"																									posA,ornA,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"			if (sepB)\n"
+"			{\n"
+"				dmins[i] = dmin;\n"
+"				hasSeparatingAxis[i] = 1;\n"
+"				separatingNormals[i] = sepNormal;\n"
+"			} else { hasSeparatingAxis[i] = 0; }\n"  // fix: a false second test now reports 0, as findSeparatingAxisKernel does, instead of leaving the nonzero provisional 4. NOTE(review): if this string is generated from a .cl source, mirror the change there.
+"		}\n"
+"		\n"
+"	}\n"
+"}\n"
+"__kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, \n"  // [C++-side note, outside the kernel text] One work-item per pair: for pairs whose flag is nonzero, runs the edge-edge test starting from the stored dmin and normal. pairs: x,y = body indices.
+"																					__global const BodyData* rigidBodies, \n"  // per-body data; m_collidableIdx, m_pos and m_quat are read here
+"																					__global const btCollidableGpu* collidables,\n"  // only m_shapeIndex is read here; shape types are not checked in this kernel
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"  // hull records; m_numFaces, m_localCenter and m_numUniqueEdges are read
+"																					__global const float4* vertices,\n"  // forwarded unchanged to findSeparatingAxisEdgeEdge
+"																					__global const float4* uniqueEdges,\n"  // forwarded unchanged to findSeparatingAxisEdgeEdge
+"																					__global const btGpuFace* faces,\n"  // forwarded unchanged to findSeparatingAxisEdgeEdge
+"																					__global const int* indices,\n"  // forwarded unchanged to findSeparatingAxisEdgeEdge
+"																					__global btAabbCL* aabbs,\n"  // not referenced in this kernel body
+"																					__global  float4* separatingNormals,\n"  // in/out: read as the starting normal, rewritten when the edge-edge test returns true
+"																					__global  int* hasSeparatingAxis,\n"  // in/out: pairs with 0 are skipped; rewritten to 0 or 1 when the edge-edge test runs
+"																					__global  float* dmins,\n"  // in: starting dmin for the edge-edge test; not written here
+"																					__global const float4* unitSphereDirections,\n"  // only referenced inside the commented-out block below
+"																					int numUnitSphereDirections,\n"  // upper limit on the edge-pair count handled by this kernel
+"																					int numPairs\n"  // work-items with an id at or above this value do nothing
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n"  // the global work-item id is used as the pair index
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"		if (hasSeparatingAxis[i])\n"  // any nonzero flag value lets the pair through
+"		{\n"
+"	\n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"	\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"			\n"
+"			int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"  // not used afterwards in this kernel
+"	\n"
+"			float dmin = dmins[i];\n"  // continue from the stored value
+"	\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			posA.w = 0.f;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			posB.w = 0.f;\n"
+"			float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"			float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"			float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"			const float4 DeltaC2 = c0 - c1;\n"  // transformed local center of A minus that of B
+"			float4 sepNormal = separatingNormals[i];\n"  // continue from the stored normal
+"			\n"
+"			\n"
+"			\n"
+"			bool sepEE = false;\n"
+"			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n"  // product of the two unique-edge counts
+"			if (numEdgeEdgeDirections<=numUnitSphereDirections)\n"  // larger products are not handled here; this kernel then leaves the flag and normal untouched
+"			{\n"
+"				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,uniqueEdges,faces,\n"
+"																									indices,&sepNormal,&dmin);\n"
+"																									\n"
+"					if (!sepEE)\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 0;\n"
+"					} else\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 1;\n"
+"						separatingNormals[i] = sepNormal;\n"
+"					}\n"
+"			}\n"
+"			/*\n"  // from here to the closing marker the kernel text is an OpenCL block comment, so it is not compiled
+"			///else case is a separate kernel, to make Mac OSX OpenCL compiler happy\n"
+"			else\n"
+"			{\n"
+"				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,unitSphereDirections,numUnitSphereDirections,\n"
+"																									&sepNormal,&dmin);\n"
+"					if (!sepEE)\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 0;\n"
+"					} else\n"
+"					{\n"
+"						hasSeparatingAxis[i] = 1;\n"
+"						separatingNormals[i] = sepNormal;\n"
+"					}\n"
+"			}\n"
+"			*/\n"
+"		}		//if (hasSeparatingAxis[i])\n"
+"	}//(i<numPairs)\n"
+"}\n"
+"inline int	findClippingFaces(const float4 separatingNormal,\n"  // [C++-side note, outside the kernel text] Picks one face on each hull relative to separatingNormal, writes their transformed vertices and the transformed A-face normal, and records indices and counts in clippingFaces[pairIndex]. Always returns 0.
+"                      const ConvexPolyhedronCL* hullA, \n"  // hull A; declared without a __global qualifier, unlike hullB
+"					  __global const ConvexPolyhedronCL* hullB,\n"  // hull B in __global memory
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"  // poses applied to the face normals and vertices of A and B
+"                       __global float4* worldVertsA1,\n"  // out: transformed vertices of the chosen A face, capacityWorldVerts slots per pair
+"                      __global float4* worldNormalsA1,\n"  // out: transformed normal of the chosen A face, one slot per pair
+"                      __global float4* worldVertsB1,\n"  // out: transformed vertices of the chosen B face, capacityWorldVerts slots per pair
+"                      int capacityWorldVerts,\n"  // per-pair vertex slot count; both vertex counts are clamped to it
+"                      const float minDist, float maxDist,\n"  // neither value is referenced in this function body
+"					  const float4* verticesA,\n"
+"                      const btGpuFace* facesA,\n"
+"                      const int* indicesA,\n"
+"					  __global const float4* verticesB,\n"
+"                      __global const btGpuFace* facesB,\n"
+"                      __global const int* indicesB,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"  // out: x = face index on A, y = face index on B, z = A vertex count, w = B vertex count
+"{\n"
+"	int numContactsOut = 0;\n"  // never modified below, so the function always returns 0
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=0;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"  // B: pick the face whose rotated normal has the largest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"        if (numVertices>capacityWorldVerts)\n"  // clamp to the per-pair capacity; extra vertices are dropped
+"            numVertices = capacityWorldVerts;\n"
+"        \n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"            if (e0<capacityWorldVerts)\n"  // always true after the clamp above
+"            {\n"
+"                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"  // append the transformed vertex to the slice of this pair
+"            }\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=0;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"  // A: pick the face whose rotated normal has the smallest dot product with separatingNormal
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              facesA[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n"  // rewritten each time a better face is found; the last write belongs to the chosen face
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"    if (numVerticesA>capacityWorldVerts)\n"  // clamp to the per-pair capacity; extra vertices are dropped
+"       numVerticesA = capacityWorldVerts;\n"
+"    \n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        if (e0<capacityWorldVerts)\n"  // always true after the clamp above
+"        {\n"
+"            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"        }\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n"
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"  // still 0 here
+"}\n"
+"// work-in-progress\n"
+"__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n" // Per-pair test of one triangle of body A against a convex shape (or compound child) of body B. concavePairs: x = body A, y = body B, z = face index inside shape A, w = child-shape index for compounds; w is set to -1 when the pair is rejected.
+"																					__global const BodyData* rigidBodies,\n" // read for m_collidableIdx, m_pos and m_quat of both bodies
+"																					__global const btCollidableGpu* collidables,\n" // read for m_shapeIndex and m_shapeType
+"																					__global const ConvexPolyhedronCL* convexShapes, \n" // read for m_numFaces, m_faceOffset, m_vertexOffset (shape A) and m_localCenter (shape B); also handed to the helpers
+"																					__global const float4* vertices,\n" // vertex pool: triangle corners are fetched from it and it is forwarded as the vertex data of B
+"																					__global const float4* uniqueEdges,\n" // forwarded to the separating-axis helpers as the edge data of B
+"																					__global const btGpuFace* faces,\n" // face pool: source of the triangle face of A and forwarded as the face data of B
+"																					__global const int* indices,\n" // index pool addressed through face.m_indexOffset
+"																					__global const btGpuChildShape* gpuChildShapes,\n" // child entries (m_shapeIndex, m_childPosition, m_childOrientation); only read when B is a compound
+"																					__global btAabbCL* aabbs,\n" // indexed by body; only aabbs[bodyIndexB] is read here
+"																					__global float4* concaveSeparatingNormalsOut,\n" // out: per-pair axis with dmin stored in w
+"																					__global int* concaveHasSeparatingNormals,\n" // out: cleared to 0 on entry, set to 1 when all three helper tests return true
+"																					__global int4* clippingFacesOut,\n" // forwarded to findClippingFaces, which writes one int4 per pair
+"																					__global float4* worldVertsA1GPU,\n" // forwarded to findClippingFaces
+"																					__global float4*  worldNormalsAGPU,\n" // forwarded to findClippingFaces
+"																					__global float4* worldVertsB1GPU,\n" // forwarded to findClippingFaces
+"																					int vertexFaceCapacity,\n" // forwarded to findClippingFaces as its capacityWorldVerts argument
+"																					int numConcavePairs\n" // work-items whose global id is >= this value exit immediately
+"																					)\n"
+"{\n"
+"	int i = get_global_id(0);\n" // one work-item handles one entry of concavePairs
+"	if (i>=numConcavePairs)\n"
+"		return;\n"
+"	concaveHasSeparatingNormals[i] = 0;\n" // default result; only changed to 1 further down
+"	int pairIdx = i;\n"
+"	int bodyIndexA = concavePairs[i].x;\n"
+"	int bodyIndexB = concavePairs[i].y;\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" // B must be a convex hull or a compound of convex hulls
+"		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
+"	{\n"
+"		concavePairs[pairIdx].w = -1;\n" // reject the pair
+"		return;\n" // NOTE(review): this early exit skips the z = -1 reset done at the end of the kernel - confirm that is intended
+"	}\n"
+"	int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" // not used below
+"	int numActualConcaveConvexTests = 0;\n" // only incremented below, never read
+"	\n"
+"	int f = concavePairs[i].z;\n" // face index relative to m_faceOffset of shape A
+"	\n"
+"	bool overlap = false;\n"
+"	\n"
+"	ConvexPolyhedronCL convexPolyhedronA;\n" // scratch polyhedron describing the single triangle; its offsets are all 0 because the data lives in the local arrays below
+"	//add 3 vertices of the triangle\n"
+"	convexPolyhedronA.m_numVertices = 3;\n"
+"	convexPolyhedronA.m_vertexOffset = 0;\n"
+"	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n" // accumulates the sum of the three corners
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	float4 triMinAabb, triMaxAabb;\n" // declared but not used below
+"	btAabbCL triAabb;\n"
+"	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" // start with inverted bounds so the first vertex replaces them
+"	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"	\n"
+"	float4 verticesA[3];\n"
+"	for (int i=0;i<3;i++)\n" // this i shadows the work-item index inside the loop only; the outer i is used again afterwards
+"	{\n"
+"		int index = indices[face.m_indexOffset+i];\n"
+"		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"		verticesA[i] = vert;\n"
+"		localCenter += vert;\n"
+"			\n"
+"		triAabb.m_min = min(triAabb.m_min,vert);		\n" // component-wise bounds of the triangle; NOTE(review): built from untransformed shape vertices - confirm aabbs[] is expressed in the same space
+"		triAabb.m_max = max(triAabb.m_max,vert);		\n"
+"	}\n"
+"	overlap = true;\n" // assume overlap, then clear the flag if the boxes are disjoint on any axis
+"	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" // x axis
+"	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" // z axis
+"	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" // y axis
+"		\n"
+"	if (overlap)\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n" // passed by pointer to the helpers and later stored in sepAxis.w
+"		int hasSeparatingAxis=5;\n" // placeholder value, overwritten before it is read
+"		float4 sepAxis=make_float4(1,2,3,4);\n" // placeholder-looking initial value; the helpers receive a pointer to it
+"		int localCC=0;\n" // not used below
+"		numActualConcaveConvexTests++;\n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" // edge vectors 0->1, 1->2, 2->0
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"                                  \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" // xyz of the stored face plane, w forced to 0
+"                             \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" // entries 0..4 are filled below (front, back, three edge faces); the constant is defined outside this view and must be at least 5
+"		int indicesA[3+3+2+2+2];\n" // 3 for the front face, 3 for the back face, 2 for each of the three edge faces
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"		//front size of triangle\n"
+"		{\n" // front face: winding 0,1,2 and the plane copied from the source face
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n" // back face: reversed winding 2,1,0 and negated normal
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot(normal,verticesA[0]);\n" // plane offset for the negated normal; normal.w is 0 so the w of the vertex does not contribute
+"			float c1 = -face.m_plane.w;\n" // not used below
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		bool addEdgePlanes = true;\n" // always true here, so the edge faces are always added
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n" // start with the edge between vertex 2 and vertex 0
+"			for (int i=0;i<numVertices;i++)\n" // one two-index face per triangle edge (i, prevVertex)
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                                            \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n" // unit vector perpendicular to both the triangle normal and the edge
+"				float c = -dot(edgeNormal,v0);\n" // plane offset chosen so that v0 lies on the plane
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                                            \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" // average of the three corners
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		\n"
+"		///////////////////\n"
+"		///compound shape support\n"
+"		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" // for a compound, replace the pose and shape of B with those of the selected child
+"		{\n"
+"			int compoundChild = concavePairs[pairIdx].w;\n" // w is used directly as an index into gpuChildShapes
+"			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n"
+"			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n" // child position carried through the pose of body B
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n" // body orientation combined with the child orientation
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
+"		}\n"
+"		//////////////////\n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n" // transformed centre of A minus transformed centre of B
+"		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n" // first helper call: triangle data comes from the local arrays, B data from the global pools
+"												posA,ornA,\n"
+"												posB,ornB,\n"
+"												DeltaC2,\n"
+"												verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"												vertices,uniqueEdges,faces,indices,\n"
+"												&sepAxis,&dmin);\n"
+"		hasSeparatingAxis = 4;\n" // placeholder; every branch below overwrites it with 0 or 1
+"		if (!sepA)\n"
+"		{\n"
+"			hasSeparatingAxis = 0;\n"
+"		} else\n"
+"		{\n"
+"			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,\n" // second helper call with the two hulls in swapped order
+"												posB,ornB,\n"
+"												posA,ornA,\n"
+"												DeltaC2,\n"
+"												vertices,uniqueEdges,faces,indices,\n"
+"												verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"												&sepAxis,&dmin);\n"
+"			if (!sepB)\n"
+"			{\n"
+"				hasSeparatingAxis = 0;\n"
+"			} else\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n" // third helper call (edge-edge variant, going by its name)
+"															posA,ornA,\n"
+"															posB,ornB,\n"
+"															DeltaC2,\n"
+"															verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"															vertices,uniqueEdges,faces,indices,\n"
+"															&sepAxis,&dmin);\n"
+"	\n"
+"				if (!sepEE)\n"
+"				{\n"
+"					hasSeparatingAxis = 0;\n"
+"				} else\n"
+"				{\n"
+"					hasSeparatingAxis = 1;\n" // reached only when all three helper calls returned true
+"				}\n"
+"			}\n"
+"		}	\n"
+"		\n"
+"		if (hasSeparatingAxis)\n" // NOTE(review): the meaning of a true helper result is defined outside this view - confirm before relying on the variable name
+"		{\n"
+"			sepAxis.w = dmin;\n" // pack dmin into the w component of the stored axis
+"			concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"			concaveHasSeparatingNormals[i]=1;\n"
+"			float minDist = -1e30f;\n"
+"			float maxDist = 0.02f;\n"
+"		\n"
+"			findClippingFaces(sepAxis,\n" // fills the per-pair clipping outputs; the int return value is ignored
+"                     &convexPolyhedronA,\n"
+"					 &convexShapes[shapeIndexB],\n"
+"					 posA,ornA,\n"
+"					 posB,ornB,\n"
+"                      worldVertsA1GPU,\n"
+"                      worldNormalsAGPU,\n"
+"                      worldVertsB1GPU,\n"
+"					  vertexFaceCapacity,\n"
+"                      minDist, maxDist,\n"
+"                      verticesA,\n"
+"                      facesA,\n"
+"                      indicesA,\n"
+" 					  vertices,\n"
+"                      faces,\n"
+"                      indices,\n"
+"                      clippingFacesOut, pairIdx);\n"
+"		} else\n"
+"		{	\n"
+"			//mark this pair as in-active\n"
+"			concavePairs[pairIdx].w = -1;\n" // at least one helper call returned false
+"		}\n"
+"	}\n"
+"	else\n"
+"	{	\n"
+"		//mark this pair as in-active\n"
+"		concavePairs[pairIdx].w = -1;\n" // the bounding boxes did not overlap
+"	}\n"
+"	\n"
+"	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n"
+"}\n"
+;