1 files changed, 2018 insertions, 0 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
new file mode 100644
index 0000000000..a6565fd6fa
--- /dev/null
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
@@ -0,0 +1,2018 @@
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+
+
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_CONCAVE_TRIMESH 5
+#define TRIANGLE_NUM_CONVEX_FACES 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+
+#define B3_MAX_STACK_DEPTH 256
+
+
+typedef unsigned int u32;
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	union {
+		int m_numChildShapes;
+		int m_bvhIndex;
+	};
+	union
+	{
+		float m_radius;
+		int	m_compoundBvhIndex;
+	};
+	
+	int m_shapeType;
+	int m_shapeIndex;
+	
+} btCollidableGpu;
+
+#define MAX_NUM_PARTS_IN_BITS 10
+
+///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct
+{
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes
+	int	m_escapeIndexOrTriangleIndex;
+} b3QuantizedBvhNode;
+
+typedef struct
+{
+	float4		m_aabbMin;
+	float4		m_aabbMax;
+	float4		m_quantization;
+	int			m_numNodes;
+	int			m_numSubTrees;
+	int			m_nodeOffset;
+	int			m_subTreeOffset;
+
+} b3BvhInfo;
+
+
+int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)
+{
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+	// Get only the lower bits where the triangle index is stored
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));
+}
+
+int	getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	unsigned int x=0;
+	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+	// Get only the lower bits where the triangle index is stored
+	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));
+}
+
+int isLeafNode(const b3QuantizedBvhNode* rootNode)
+{
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+}
+
+int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;
+}
+	
+int getEscapeIndex(const b3QuantizedBvhNode* rootNode)
+{
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+}
+
+int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)
+{
+	return -rootNode->m_escapeIndexOrTriangleIndex;
+}
+
+
+typedef struct
+{
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	int			m_subtreeSize;
+	int			m_padding[3];
+} b3BvhSubtreeInfo;
+
+
+
+
+
+
+
+typedef struct
+{
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx;
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  
+{
+	float4		m_localCenter;
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	
+	float			m_radius;
+	int	m_faceOffset;
+	int m_numFaces;
+	int	m_numVertices;
+
+	int m_vertexOffset;
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+
+
+
+typedef struct
+{
+	float4 m_plane;
+	int m_indexOffset;
+	int m_numIndices;
+} btGpuFace;
+
+#define make_float4 (float4)
+
+
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	return cross(a,b);
+
+	
+//	float4 a1 = make_float4(a.xyz,0.f);
+//	float4 b1 = make_float4(b.xyz,0.f);
+
+//	return cross(a1,b1);
+
+//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);
+	
+	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);
+	
+	//return c;
+}
+
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	float4 a1 = make_float4(a.xyz,0.f);
+	float4 b1 = make_float4(b.xyz,0.f);
+	return dot(a1, b1);
+}
+
+__inline
+float4 fastNormalize4(float4 v)
+{
+	v = make_float4(v.xyz,0.f);
+	return fast_normalize(v);
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion;
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b);
+
+__inline
+Quaternion qtNormalize(Quaternion in);
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec);
+
+__inline
+Quaternion qtInvert(Quaternion q);
+
+
+
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	Quaternion ans;
+	ans = cross3( a, b );
+	ans += a.w*b+b.w*a;
+//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
+	ans.w = a.w*b.w - dot3F4(a, b);
+	return ans;
+}
+
+__inline
+Quaternion qtNormalize(Quaternion in)
+{
+	return fastNormalize4(in);
+//	in /= length( in );
+//	return in;
+}
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	Quaternion qInv = qtInvert( q );
+	float4 vcpy = vec;
+	vcpy.w = 0.f;
+	float4 out = qtMul(qtMul(q,vcpy),qInv);
+	return out;
+}
+
+__inline
+Quaternion qtInvert(Quaternion q)
+{
+	return (Quaternion)(-q.xyz, q.w);
+}
+
+__inline
+float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	return qtRotate( qtInvert( q ), vec );
+}
+
+__inline
+float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	return qtRotate( *orientation, *p ) + (*translation);
+}
+
+
+
+__inline
+float4 normalize3(const float4 a)
+{
+	float4 n = make_float4(a.x, a.y, a.z, 0.f);
+	return fastNormalize4( n );
+}
+
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max)
+{
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+
+	const float4 localDir = qtInvRotate(orn,*dir);
+	float offset = dot(pos,*dir);
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0])
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+}
+
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+{
+	min[0] = FLT_MAX;
+	max[0] = -FLT_MAX;
+	int numVerts = hull->m_numVertices;
+
+	const float4 localDir = qtInvRotate(orn,*dir);
+	float offset = dot(pos,*dir);
+	for(int i=0;i<numVerts;i++)
+	{
+		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
+		if(dp < min[0])	
+			min[0] = dp;
+		if(dp > max[0])	
+			max[0] = dp;
+	}
+	if(min[0]>max[0])
+	{
+		float tmp = min[0];
+		min[0] = max[0];
+		max[0] = tmp;
+	}
+	min[0] += offset;
+	max[0] += offset;
+}
+
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+{
+	float Min0,Max0;
+	float Min1,Max1;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);
+	project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);
+
+	if(Max0<Min1 || Max1<Min0)
+		return false;
+
+	float d0 = Max0 - Min1;
+	float d1 = Max1 - Min0;
+	*depth = d0<d1 ? d0:d1;
+	return true;
+}
+
+
+
+
+inline bool IsAlmostZero(const float4 v)
+{
+	if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)
+		return false;
+	return true;
+}
+
+
+
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	
+
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0;
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal);
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS*=-1.f;
+			curPlaneTests++;
+			float d;
+			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))
+				return false;
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, 
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA,
+	const float4* verticesB,
+	const float4* uniqueEdgesB, 
+	const btGpuFace* facesB,
+	const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+
+
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	int curPlaneTests=0;
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal);
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS *= -1.f;
+			curPlaneTests++;
+			float d;
+			if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))
+				return false;
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+{
+
+
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0;
+
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			curEdgeEdge++;
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje *= -1.f;
+
+				float dist;
+				bool result = true;
+				{
+					float Min0,Max0;
+					float Min1,Max1;
+					projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+					project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+				
+					if(Max0<Min1 || Max1<Min0)
+						result = false;
+				
+					float d0 = Max0 - Min1;
+					float d1 = Max1 - Min0;
+					dist = d0<d1 ? d0:d1;
+					result = true;
+
+				}
+				
+
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+
+	}
+
+	
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, __global const float4* vertices,float* depth)
+{
+	float Min0,Max0;
+	float Min1,Max1;
+	project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);
+	project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);
+
+	if(Max0<Min1 || Max1<Min0)
+		return false;
+
+	float d0 = Max0 - Min1;
+	float d1 = Max1 - Min0;
+	*depth = d0<d1 ? d0:d1;
+	return true;
+}
+
+
+bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+{
+	
+
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+	
+	int curPlaneTests=0;
+
+	{
+		int numFacesA = hullA->m_numFaces;
+		// Test normals from hullA
+		for(int i=0;i<numFacesA;i++)
+		{
+			const float4 normal = faces[hullA->m_faceOffset+i].m_plane;
+			float4 faceANormalWS = qtRotate(ornA,normal);
+	
+			if (dot3F4(DeltaC2,faceANormalWS)<0)
+				faceANormalWS*=-1.f;
+				
+			curPlaneTests++;
+	
+			float d;
+			if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))
+				return false;
+	
+			if(d<*dmin)
+			{
+				*dmin = d;
+				*sep = faceANormalWS;
+			}
+		}
+	}
+
+
+		if((dot3F4(-DeltaC2,*sep))>0.0f)
+		{
+			*sep = -(*sep);
+		}
+	
+	return true;
+}
+
+
+
+
+bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices,
+	__global const float4* unitSphereDirections,
+	int numUnitSphereDirections,
+	float4* sep,
+	float* dmin)
+{
+	
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0;
+
+	int curEdgeEdge = 0;
+	// Test unit sphere directions
+	for (int i=0;i<numUnitSphereDirections;i++)
+	{
+
+		float4 crossje;
+		crossje = unitSphereDirections[i];	
+
+		if (dot3F4(DeltaC2,crossje)>0)
+			crossje *= -1.f;
+		{
+			float dist;
+			bool result = true;
+			float Min0,Max0;
+			float Min1,Max1;
+			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+		
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+		
+			float d0 = Max0 - Min1;
+			float d1 = Max1 - Min0;
+			dist = d0<d1 ? d0:d1;
+			result = true;
+	
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+{
+	
+
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0;
+
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			float4 edge1World = qtRotate(ornB,edge1);
+
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			curEdgeEdge++;
+			if(!IsAlmostZero(crossje))
+			{
+				crossje = normalize3(crossje);
+				if (dot3F4(DeltaC2,crossje)<0)
+					crossje*=-1.f;
+					
+				float dist;
+				bool result = true;
+				{
+					float Min0,Max0;
+					float Min1,Max1;
+					project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+					project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+				
+					if(Max0<Min1 || Max1<Min0)
+						return false;
+				
+					float d0 = Max0 - Min1;
+					float d1 = Max1 - Min0;
+					dist = d0<d1 ? d0:d1;
+					result = true;
+
+				}
+				
+
+				if(dist<*dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+
+	}
+
+	
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
+// work-in-progress
+__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global volatile float4* gpuCompoundSepNormalsOut,
+																					__global volatile int* gpuHasCompoundSepNormalsOut,
+																					int numCompoundPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	if (i<numCompoundPairs)
+	{
+		int bodyIndexA = gpuCompoundPairs[i].x;
+		int bodyIndexB = gpuCompoundPairs[i].y;
+
+		int childShapeIndexA = gpuCompoundPairs[i].z;
+		int childShapeIndexB = gpuCompoundPairs[i].w;
+		
+		int collidableIndexA = -1;
+		int collidableIndexB = -1;
+		
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		
+		float4 ornB = rigidBodies[bodyIndexB].m_quat;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+							
+		if (childShapeIndexA >= 0)
+		{
+			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+			float4 newPosA = qtRotate(ornA,childPosA)+posA;
+			float4 newOrnA = qtMul(ornA,childOrnA);
+			posA = newPosA;
+			ornA = newOrnA;
+		} else
+		{
+			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		}
+		
+		if (childShapeIndexB>=0)
+		{
+			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		} else
+		{
+			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+		}
+	
+		gpuHasCompoundSepNormalsOut[i] = 0;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	
+		int shapeTypeA = collidables[collidableIndexA].m_shapeType;
+		int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+	
+
+		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))
+		{
+			return;
+		}
+
+		int hasSeparatingAxis = 5;
+							
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+		float dmin = FLT_MAX;
+		posA.w = 0.f;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal = make_float4(1,0,0,0);
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else//(!sepB)
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+				if (sepEE)
+				{
+						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);
+						gpuHasCompoundSepNormalsOut[i] = 1;
+				}//sepEE
+			}//(!sepB)
+		}//(!sepA)
+		
+		
+	}
+		
+}
+
+
+inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+{
+		b3Float4 vecOut;
+		vecOut = b3MakeFloat4(
+			(float)(vecIn[0]) / (quantization.x),
+			(float)(vecIn[1]) / (quantization.y),
+			(float)(vecIn[2]) / (quantization.z),
+			0.f);
+
+		vecOut += bvhAabbMin;
+		return vecOut;
+}
+
+inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)
+{
+		b3Float4 vecOut;
+		vecOut = b3MakeFloat4(
+			(float)(vecIn[0]) / (quantization.x),
+			(float)(vecIn[1]) / (quantization.y),
+			(float)(vecIn[2]) / (quantization.z),
+			0.f);
+
+		vecOut += bvhAabbMin;
+		return vecOut;
+}
+
+
+// work-in-progress
+__kernel void   findCompoundPairsKernel( __global const int4* pairs, 
+	__global const BodyData* rigidBodies, 
+	__global const btCollidableGpu* collidables,
+	__global const ConvexPolyhedronCL* convexShapes, 
+	__global const float4* vertices,
+	__global const float4* uniqueEdges,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	__global b3Aabb_t* aabbLocalSpace,
+	__global const btGpuChildShape* gpuChildShapes,
+	__global volatile int4* gpuCompoundPairsOut,
+	__global volatile int* numCompoundPairsOut,
+	__global const b3BvhSubtreeInfo* subtrees,
+	__global const b3QuantizedBvhNode* quantizedNodes,
+	__global const b3BvhInfo* bvhInfos,
+	int numPairs,
+	int maxNumCompoundPairsCapacity
+	)
+{
+
+	int i = get_global_id(0);
+
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;
+			int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;
+			int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;
+			int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;
+			int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;
+
+
+			int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;
+			
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			b3Quat ornA = rigidBodies[bodyIndexA].m_quat;
+
+			b3Quat ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+			
+			for (int p=0;p<numSubTreesA;p++)
+			{
+				b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];
+				//bvhInfos[bvhA].m_quantization
+				b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+				b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+
+				b3Float4 aabbAMinOut,aabbAMaxOut;
+				float margin=0.f;
+				b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+				
+				for (int q=0;q<numSubTreesB;q++)
+				{
+					b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];
+
+					b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+					b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+
+					b3Float4 aabbBMinOut,aabbBMaxOut;
+					float margin=0.f;
+					b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+
+					
+					
+					bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+					if (aabbOverlap)
+					{
+						
+						int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;
+						int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;
+
+						int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;
+						int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;
+
+
+						b3Int2 nodeStack[B3_MAX_STACK_DEPTH];
+						b3Int2 node0;
+						node0.x = startNodeIndexA;
+						node0.y = startNodeIndexB;
+						int maxStackDepth = B3_MAX_STACK_DEPTH;
+						int depth=0;
+						nodeStack[depth++]=node0;
+
+						do
+						{
+							b3Int2 node = nodeStack[--depth];
+
+							b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+							b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);
+
+							b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+							b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);
+
+							float margin=0.f;
+							b3Float4 aabbAMinOut,aabbAMaxOut;
+							b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);
+
+							b3Float4 aabbBMinOut,aabbBMaxOut;
+							b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);
+
+							
+							bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);
+							if (nodeOverlap)
+							{
+								bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);
+								bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);
+								bool isInternalA = !isLeafA;
+								bool isInternalB = !isLeafB;
+
+								//fail, even though it might hit two leaf nodes
+								if (depth+4>maxStackDepth && !(isLeafA && isLeafB))
+								{
+									//printf("Error: traversal exceeded maxStackDepth");
+									continue;
+								}
+
+								if(isInternalA)
+								{
+									int nodeAleftChild = node.x+1;
+									bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);
+									int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);
+
+									if(isInternalB)
+									{					
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);
+									}
+									else
+									{
+										nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);
+										nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);
+									}
+								}
+								else
+								{
+									if(isInternalB)
+									{
+										int nodeBleftChild = node.y+1;
+										bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);
+										int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);
+										nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);
+									}
+									else
+									{
+										int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+										if (compoundPairIdx<maxNumCompoundPairsCapacity)
+										{
+											int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);
+											int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);
+											gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+										}
+									}
+								}
+							}
+						} while (depth);
+					}
+				}
+			}
+			
+			return;
+		}
+
+
+
+
+
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					float4 ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = qtRotate(ornA,childPosA)+posA;
+					float4 newOrnA = qtMul(ornA,childOrnA);
+
+					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;
+					b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];
+					float margin = 0.f;
+					
+					b3Float4 aabbAMinWS;
+					b3Float4 aabbAMaxWS;
+					
+					b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,
+						newPosA,
+						newOrnA,
+						&aabbAMinWS,&aabbAMaxWS);
+						
+					
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							float4 newOrnB = qtMul(ornB,childOrnB);
+
+							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+							b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];
+							
+							b3Float4 aabbBMinWS;
+							b3Float4 aabbBMaxWS;
+							
+							b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,
+								newPosB,
+								newOrnB,
+								&aabbBMinWS,&aabbBMaxWS);
+								
+								
+							
+							bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);
+							if (aabbOverlap)
+							{
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								float4 ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								float4 ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+
+								{//
+									int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+							int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+							float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							float4 ornA = newOrnA;
+							float4 c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1 = transform(&c1local,&posB,&ornB);
+							const float4 DeltaC2 = c0 - c1;
+
+							{
+								int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+				}//for (int b=0;b<numChildrenB;b++)	
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) 
+				&& (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					float4 ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = qtRotate(ornB,childPosB)+posB;
+					float4 newOrnB = qtMul(ornB,childOrnB);
+
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+
+
+					//////////////////////////////////////
+
+					if (1)
+					{
+						int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						float4 ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						float4 ornB =newOrnB;
+						float4 c1 = transform(&c1local,&posB,&ornB);
+						const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+}
+
+// work-in-progress
+__kernel void   findSeparatingAxisKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global volatile float4* separatingNormals,
+																					__global volatile int* hasSeparatingAxis,
+																					int numPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+	
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		
+
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+			
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+
+		float dmin = FLT_MAX;
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal;
+		
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		hasSeparatingAxis[i] = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis[i] = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis[i] = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+				if (!sepEE)
+				{
+					hasSeparatingAxis[i] = 0;
+				} else
+				{
+					hasSeparatingAxis[i] = 1;
+					separatingNormals[i] = sepNormal;
+				}
+			}
+		}
+		
+	}
+
+}
+
+
+__kernel void   findSeparatingAxisVertexFaceKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global volatile float4* separatingNormals,
+																					__global volatile int* hasSeparatingAxis,
+																					__global  float* dmins,
+																					int numPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+	
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	
+		hasSeparatingAxis[i] = 0;	
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		
+
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			return;
+		}
+			
+
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+
+		float dmin = FLT_MAX;
+
+		dmins[i] = dmin;
+		
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal;
+		
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		hasSeparatingAxis[i] = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis[i] = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+
+			if (sepB)
+			{
+				dmins[i] = dmin;
+				hasSeparatingAxis[i] = 1;
+				separatingNormals[i] = sepNormal;
+			}
+		}
+		
+	}
+
+}
+
+
+__kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global  float4* separatingNormals,
+																					__global  int* hasSeparatingAxis,
+																					__global  float* dmins,
+																					__global const float4* unitSphereDirections,
+																					int numUnitSphereDirections,
+																					int numPairs
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+		if (hasSeparatingAxis[i])
+		{
+	
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+	
+			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			
+			
+			int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	
+			float dmin = dmins[i];
+	
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			posA.w = 0.f;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			posB.w = 0.f;
+			float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 c0 = transform(&c0local, &posA, &ornA);
+			float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+			float4 ornB =rigidBodies[bodyIndexB].m_quat;
+			float4 c1 = transform(&c1local,&posB,&ornB);
+			const float4 DeltaC2 = c0 - c1;
+			float4 sepNormal = separatingNormals[i];
+			
+			
+			
+			bool sepEE = false;
+			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
+			if (numEdgeEdgeDirections<=numUnitSphereDirections)
+			{
+				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+																									
+					if (!sepEE)
+					{
+						hasSeparatingAxis[i] = 0;
+					} else
+					{
+						hasSeparatingAxis[i] = 1;
+						separatingNormals[i] = sepNormal;
+					}
+			}
+			/*
+			///else case is a separate kernel, to make Mac OSX OpenCL compiler happy
+			else
+			{
+				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,unitSphereDirections,numUnitSphereDirections,
+																									&sepNormal,&dmin);
+					if (!sepEE)
+					{
+						hasSeparatingAxis[i] = 0;
+					} else
+					{
+						hasSeparatingAxis[i] = 1;
+						separatingNormals[i] = sepNormal;
+					}
+			}
+			*/
+		}		//if (hasSeparatingAxis[i])
+	}//(i<numPairs)
+}
+
+
+
+
+
+inline int	findClippingFaces(const float4 separatingNormal,
+                      const ConvexPolyhedronCL* hullA, 
+					  __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,
+                       __global float4* worldVertsA1,
+                      __global float4* worldNormalsA1,
+                      __global float4* worldVertsB1,
+                      int capacityWorldVerts,
+                      const float minDist, float maxDist,
+					  const float4* verticesA,
+                      const btGpuFace* facesA,
+                      const int* indicesA,
+					  __global const float4* verticesB,
+                      __global const btGpuFace* facesB,
+                      __global const int* indicesB,
+                      __global int4* clippingFaces, int pairIndex)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+    
+    
+	int closestFaceB=0;
+	float dmax = -FLT_MAX;
+    
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,
+                                              facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+    
+	{
+		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		int numVertices = polyB.m_numIndices;
+        if (numVertices>capacityWorldVerts)
+            numVertices = capacityWorldVerts;
+        
+		for(int e0=0;e0<numVertices;e0++)
+		{
+            if (e0<capacityWorldVerts)
+            {
+                const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+                worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+            }
+		}
+	}
+    
+    int closestFaceA=0;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+                                              facesA[hullA->m_faceOffset+face].m_plane.x,
+                                              facesA[hullA->m_faceOffset+face].m_plane.y,
+                                              facesA[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+            
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS;
+			}
+		}
+	}
+    
+    int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;
+    if (numVerticesA>capacityWorldVerts)
+       numVerticesA = capacityWorldVerts;
+    
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        if (e0<capacityWorldVerts)
+        {
+            const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+            worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);
+        }
+    }
+    
+    clippingFaces[pairIndex].x = closestFaceA;
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA;
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+    
+    
+	return numContactsOut;
+}
+
+
+
+
+// work-in-progress
+__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,
+																					__global const BodyData* rigidBodies,
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global btAabbCL* aabbs,
+																					__global float4* concaveSeparatingNormalsOut,
+																					__global int* concaveHasSeparatingNormals,
+																					__global int4* clippingFacesOut,
+																					__global float4* worldVertsA1GPU,
+																					__global float4*  worldNormalsAGPU,
+																					__global float4* worldVertsB1GPU,
+																					int vertexFaceCapacity,
+																					int numConcavePairs
+																					)
+{
+
+	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+
+	concaveHasSeparatingNormals[i] = 0;
+
+	int pairIdx = i;
+
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&
+		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1;
+		return;
+	}
+
+
+
+	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	int numActualConcaveConvexTests = 0;
+	
+	int f = concavePairs[i].z;
+	
+	bool overlap = false;
+	
+	ConvexPolyhedronCL convexPolyhedronA;
+
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+
+	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+	float4 triMinAabb, triMaxAabb;
+	btAabbCL triAabb;
+	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+	
+	float4 verticesA[3];
+	for (int i=0;i<3;i++)
+	{
+		int index = indices[face.m_indexOffset+i];
+		float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+		verticesA[i] = vert;
+		localCenter += vert;
+			
+		triAabb.m_min = min(triAabb.m_min,vert);		
+		triAabb.m_max = max(triAabb.m_max,vert);		
+
+	}
+
+	overlap = true;
+	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+	overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+		
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis=5;
+		float4 sepAxis=make_float4(1,2,3,4);
+
+		int localCC=0;
+		numActualConcaveConvexTests++;
+
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+
+
+		convexPolyhedronA.m_faceOffset = 0;
+                                  
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+                             
+		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+
+		//front size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+                                            
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot(edgeNormal,v0);
+
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+                                            
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+
+		
+
+
+		///////////////////
+		///compound shape support
+
+		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+
+
+		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+												posA,ornA,
+												posB,ornB,
+												DeltaC2,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												vertices,uniqueEdges,faces,indices,
+												&sepAxis,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,
+												posB,ornB,
+												posA,ornA,
+												DeltaC2,
+												vertices,uniqueEdges,faces,indices,
+												verticesA,uniqueEdgesA,facesA,indicesA,
+												&sepAxis,&dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],
+															posA,ornA,
+															posB,ornB,
+															DeltaC2,
+															verticesA,uniqueEdgesA,facesA,indicesA,
+															vertices,uniqueEdges,faces,indices,
+															&sepAxis,&dmin);
+	
+				if (!sepEE)
+				{
+					hasSeparatingAxis = 0;
+				} else
+				{
+					hasSeparatingAxis = 1;
+				}
+			}
+		}	
+		
+		if (hasSeparatingAxis)
+		{
+			sepAxis.w = dmin;
+			concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+			concaveHasSeparatingNormals[i]=1;
+
+
+			float minDist = -1e30f;
+			float maxDist = 0.02f;
+
+		
+
+			findClippingFaces(sepAxis,
+                     &convexPolyhedronA,
+					 &convexShapes[shapeIndexB],
+					 posA,ornA,
+					 posB,ornB,
+                      worldVertsA1GPU,
+                      worldNormalsAGPU,
+                      worldVertsB1GPU,
+					  vertexFaceCapacity,
+                      minDist, maxDist,
+                      verticesA,
+                      facesA,
+                      indicesA,
+ 					  vertices,
+                      faces,
+                      indices,
+                      clippingFacesOut, pairIdx);
+
+
+		} else
+		{	
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{	
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+	
+	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts
+}
+
+
+