3 files changed, 1264 insertions, 1267 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
index dad42477c3..0185417786 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
@@ -1,199 +1,198 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* gridBroadphaseCL= \
-"int getPosHash(int4 gridPos, __global float4* pParams)\n"
-"{\n"
-"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
-"	gridPos.x &= gridDim.x - 1;\n"
-"	gridPos.y &= gridDim.y - 1;\n"
-"	gridPos.z &= gridDim.z - 1;\n"
-"	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
-"	return hash;\n"
-"} \n"
-"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
-"{\n"
-"    int4 gridPos;\n"
-"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
-"    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
-"    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
-"    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
-"    return gridPos;\n"
-"}\n"
-"// calculate grid hash value for each body using its AABB\n"
-"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
-"{\n"
-"    int index = get_global_id(0);\n"
-"    if(index >= numObjects)\n"
-"	{\n"
-"		return;\n"
-"	}\n"
-"	float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
-"	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
-"	float4 pos;\n"
-"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
-"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
-"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
-"	pos.w = 0.f;\n"
-"    // get address in grid\n"
-"    int4 gridPos = getGridPos(pos, pParams);\n"
-"    int gridHash = getPosHash(gridPos, pParams);\n"
-"    // store grid hash and body index\n"
-"    int2 hashVal;\n"
-"    hashVal.x = gridHash;\n"
-"    hashVal.y = index;\n"
-"    pHash[index] = hashVal;\n"
-"}\n"
-"__kernel void kClearCellStart(	int numCells, \n"
-"								__global int* pCellStart )\n"
-"{\n"
-"    int index = get_global_id(0);\n"
-"    if(index >= numCells)\n"
-"	{\n"
-"		return;\n"
-"	}\n"
-"	pCellStart[index] = -1;\n"
-"}\n"
-"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
-"{\n"
-"	__local int sharedHash[513];\n"
-"    int index = get_global_id(0);\n"
-"	int2 sortedData;\n"
-"    if(index < numObjects)\n"
-"	{\n"
-"		sortedData = pHash[index];\n"
-"		// Load hash data into shared memory so that we can look \n"
-"		// at neighboring body's hash value without loading\n"
-"		// two hash values per thread\n"
-"		sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
-"		if((index > 0) && (get_local_id(0) == 0))\n"
-"		{\n"
-"			// first thread in block must load neighbor body hash\n"
-"			sharedHash[0] = pHash[index-1].x;\n"
-"		}\n"
-"	}\n"
-"    barrier(CLK_LOCAL_MEM_FENCE);\n"
-"    if(index < numObjects)\n"
-"	{\n"
-"		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
-"		{\n"
-"			cellStart[sortedData.x] = index;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
-"{\n"
-"	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
-"			(min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
-"			(min0.z <= max1.z)&& (min1.z <= max0.z); \n"
-"}\n"
-"//search for AABB 'index' against other AABBs' in this cell\n"
-"void findPairsInCell(	int numObjects,\n"
-"						int4	gridPos,\n"
-"						int    index,\n"
-"						__global int2*  pHash,\n"
-"						__global int*   pCellStart,\n"
-"						__global float4* allpAABB, \n"
-"						__global const int* smallAabbMapping,\n"
-"						__global float4* pParams,\n"
-"							volatile  __global int* pairCount,\n"
-"						__global int4*   pPairBuff2,\n"
-"						int maxPairs\n"
-"						)\n"
-"{\n"
-"	int4 pGridDim = *((__global int4*)(pParams + 1));\n"
-"	int maxBodiesPerCell = pGridDim.w;\n"
-"    int gridHash = getPosHash(gridPos, pParams);\n"
-"    // get start of bucket for this cell\n"
-"    int bucketStart = pCellStart[gridHash];\n"
-"    if (bucketStart == -1)\n"
-"	{\n"
-"        return;   // cell empty\n"
-"	}\n"
-"	// iterate over bodies in this cell\n"
-"    int2 sortedData = pHash[index];\n"
-"	int unsorted_indx = sortedData.y;\n"
-"    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
-"	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
-"	int handleIndex =  as_int(min0.w);\n"
-"	\n"
-"	int bucketEnd = bucketStart + maxBodiesPerCell;\n"
-"	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
-"	for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
-"	{\n"
-"        int2 cellData = pHash[index2];\n"
-"        if (cellData.x != gridHash)\n"
-"        {\n"
-"			break;   // no longer in same bucket\n"
-"		}\n"
-"		int unsorted_indx2 = cellData.y;\n"
-"        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
-"		if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
-"        {   \n"
-"			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
-"			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
-"			if(testAABBOverlap(min0, max0, min1, max1))\n"
-"			{\n"
-"				if (pairCount)\n"
-"				{\n"
-"					int handleIndex2 = as_int(min1.w);\n"
-"					if (handleIndex<handleIndex2)\n"
-"					{\n"
-"						int curPair = atomic_add(pairCount,1);\n"
-"						if (curPair<maxPairs)\n"
-"						{\n"
-"							int4 newpair;\n"
-"							newpair.x = handleIndex;\n"
-"							newpair.y = handleIndex2;\n"
-"							newpair.z = -1;\n"
-"							newpair.w = -1;\n"
-"							pPairBuff2[curPair] = newpair;\n"
-"						}\n"
-"					}\n"
-"				\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void kFindOverlappingPairs(	int numObjects,\n"
-"										__global float4* allpAABB, \n"
-"										__global const int* smallAabbMapping,\n"
-"										__global int2* pHash, \n"
-"										__global int* pCellStart, \n"
-"										__global float4* pParams ,\n"
-"										volatile  __global int* pairCount,\n"
-"										__global int4*   pPairBuff2,\n"
-"										int maxPairs\n"
-"										)\n"
-"{\n"
-"    int index = get_global_id(0);\n"
-"    if(index >= numObjects)\n"
-"	{\n"
-"		return;\n"
-"	}\n"
-"    int2 sortedData = pHash[index];\n"
-"	int unsorted_indx = sortedData.y;\n"
-"	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
-"	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
-"	float4 pos;\n"
-"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
-"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
-"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
-"    // get address in grid\n"
-"    int4 gridPosA = getGridPos(pos, pParams);\n"
-"    int4 gridPosB; \n"
-"    // examine only neighbouring cells\n"
-"    for(int z=-1; z<=1; z++) \n"
-"    {\n"
-"		gridPosB.z = gridPosA.z + z;\n"
-"        for(int y=-1; y<=1; y++) \n"
-"        {\n"
-"			gridPosB.y = gridPosA.y + y;\n"
-"            for(int x=-1; x<=1; x++) \n"
-"            {\n"
-"				gridPosB.x = gridPosA.x + x;\n"
-"                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
-"            }\n"
-"        }\n"
-"    }\n"
-"}\n"
-;
+static const char* gridBroadphaseCL =
+	"int getPosHash(int4 gridPos, __global float4* pParams)\n"
+	"{\n"
+	"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
+	"	gridPos.x &= gridDim.x - 1;\n"
+	"	gridPos.y &= gridDim.y - 1;\n"
+	"	gridPos.z &= gridDim.z - 1;\n"
+	"	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
+	"	return hash;\n"
+	"} \n"
+	"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
+	"{\n"
+	"    int4 gridPos;\n"
+	"	int4 gridDim = *((__global int4*)(pParams + 1));\n"
+	"    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
+	"    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
+	"    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
+	"    return gridPos;\n"
+	"}\n"
+	"// calculate grid hash value for each body using its AABB\n"
+	"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
+	"{\n"
+	"    int index = get_global_id(0);\n"
+	"    if(index >= numObjects)\n"
+	"	{\n"
+	"		return;\n"
+	"	}\n"
+	"	float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
+	"	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
+	"	float4 pos;\n"
+	"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+	"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+	"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+	"	pos.w = 0.f;\n"
+	"    // get address in grid\n"
+	"    int4 gridPos = getGridPos(pos, pParams);\n"
+	"    int gridHash = getPosHash(gridPos, pParams);\n"
+	"    // store grid hash and body index\n"
+	"    int2 hashVal;\n"
+	"    hashVal.x = gridHash;\n"
+	"    hashVal.y = index;\n"
+	"    pHash[index] = hashVal;\n"
+	"}\n"
+	"__kernel void kClearCellStart(	int numCells, \n"
+	"								__global int* pCellStart )\n"
+	"{\n"
+	"    int index = get_global_id(0);\n"
+	"    if(index >= numCells)\n"
+	"	{\n"
+	"		return;\n"
+	"	}\n"
+	"	pCellStart[index] = -1;\n"
+	"}\n"
+	"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
+	"{\n"
+	"	__local int sharedHash[513];\n"
+	"    int index = get_global_id(0);\n"
+	"	int2 sortedData;\n"
+	"    if(index < numObjects)\n"
+	"	{\n"
+	"		sortedData = pHash[index];\n"
+	"		// Load hash data into shared memory so that we can look \n"
+	"		// at neighboring body's hash value without loading\n"
+	"		// two hash values per thread\n"
+	"		sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
+	"		if((index > 0) && (get_local_id(0) == 0))\n"
+	"		{\n"
+	"			// first thread in block must load neighbor body hash\n"
+	"			sharedHash[0] = pHash[index-1].x;\n"
+	"		}\n"
+	"	}\n"
+	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"    if(index < numObjects)\n"
+	"	{\n"
+	"		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
+	"		{\n"
+	"			cellStart[sortedData.x] = index;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
+	"{\n"
+	"	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
+	"			(min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
+	"			(min0.z <= max1.z)&& (min1.z <= max0.z); \n"
+	"}\n"
+	"//search for AABB 'index' against other AABBs' in this cell\n"
+	"void findPairsInCell(	int numObjects,\n"
+	"						int4	gridPos,\n"
+	"						int    index,\n"
+	"						__global int2*  pHash,\n"
+	"						__global int*   pCellStart,\n"
+	"						__global float4* allpAABB, \n"
+	"						__global const int* smallAabbMapping,\n"
+	"						__global float4* pParams,\n"
+	"							volatile  __global int* pairCount,\n"
+	"						__global int4*   pPairBuff2,\n"
+	"						int maxPairs\n"
+	"						)\n"
+	"{\n"
+	"	int4 pGridDim = *((__global int4*)(pParams + 1));\n"
+	"	int maxBodiesPerCell = pGridDim.w;\n"
+	"    int gridHash = getPosHash(gridPos, pParams);\n"
+	"    // get start of bucket for this cell\n"
+	"    int bucketStart = pCellStart[gridHash];\n"
+	"    if (bucketStart == -1)\n"
+	"	{\n"
+	"        return;   // cell empty\n"
+	"	}\n"
+	"	// iterate over bodies in this cell\n"
+	"    int2 sortedData = pHash[index];\n"
+	"	int unsorted_indx = sortedData.y;\n"
+	"    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
+	"	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+	"	int handleIndex =  as_int(min0.w);\n"
+	"	\n"
+	"	int bucketEnd = bucketStart + maxBodiesPerCell;\n"
+	"	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
+	"	for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
+	"	{\n"
+	"        int2 cellData = pHash[index2];\n"
+	"        if (cellData.x != gridHash)\n"
+	"        {\n"
+	"			break;   // no longer in same bucket\n"
+	"		}\n"
+	"		int unsorted_indx2 = cellData.y;\n"
+	"        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
+	"		if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
+	"        {   \n"
+	"			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
+	"			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
+	"			if(testAABBOverlap(min0, max0, min1, max1))\n"
+	"			{\n"
+	"				if (pairCount)\n"
+	"				{\n"
+	"					int handleIndex2 = as_int(min1.w);\n"
+	"					if (handleIndex<handleIndex2)\n"
+	"					{\n"
+	"						int curPair = atomic_add(pairCount,1);\n"
+	"						if (curPair<maxPairs)\n"
+	"						{\n"
+	"							int4 newpair;\n"
+	"							newpair.x = handleIndex;\n"
+	"							newpair.y = handleIndex2;\n"
+	"							newpair.z = -1;\n"
+	"							newpair.w = -1;\n"
+	"							pPairBuff2[curPair] = newpair;\n"
+	"						}\n"
+	"					}\n"
+	"				\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void kFindOverlappingPairs(	int numObjects,\n"
+	"										__global float4* allpAABB, \n"
+	"										__global const int* smallAabbMapping,\n"
+	"										__global int2* pHash, \n"
+	"										__global int* pCellStart, \n"
+	"										__global float4* pParams ,\n"
+	"										volatile  __global int* pairCount,\n"
+	"										__global int4*   pPairBuff2,\n"
+	"										int maxPairs\n"
+	"										)\n"
+	"{\n"
+	"    int index = get_global_id(0);\n"
+	"    if(index >= numObjects)\n"
+	"	{\n"
+	"		return;\n"
+	"	}\n"
+	"    int2 sortedData = pHash[index];\n"
+	"	int unsorted_indx = sortedData.y;\n"
+	"	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
+	"	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+	"	float4 pos;\n"
+	"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+	"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+	"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+	"    // get address in grid\n"
+	"    int4 gridPosA = getGridPos(pos, pParams);\n"
+	"    int4 gridPosB; \n"
+	"    // examine only neighbouring cells\n"
+	"    for(int z=-1; z<=1; z++) \n"
+	"    {\n"
+	"		gridPosB.z = gridPosA.z + z;\n"
+	"        for(int y=-1; y<=1; y++) \n"
+	"        {\n"
+	"			gridPosB.y = gridPosA.y + y;\n"
+	"            for(int x=-1; x<=1; x++) \n"
+	"            {\n"
+	"				gridPosB.x = gridPosA.x + x;\n"
+	"                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
+	"            }\n"
+	"        }\n"
+	"    }\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
index 5eb8f45b16..c02877dde9 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@@ -1,729 +1,728 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* parallelLinearBvhCL= \
-"/*\n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose,\n"
-"including commercial applications, and to alter it and redistribute it freely,\n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Initial Author Jackson Lee, 2014\n"
-"typedef float b3Scalar;\n"
-"typedef float4 b3Vector3;\n"
-"#define b3Max max\n"
-"#define b3Min min\n"
-"#define b3Sqrt sqrt\n"
-"typedef struct\n"
-"{\n"
-"	unsigned int m_key;\n"
-"	unsigned int m_value;\n"
-"} SortDataCL;\n"
-"typedef struct \n"
-"{\n"
-"	union\n"
-"	{\n"
-"		float4	m_min;\n"
-"		float   m_minElems[4];\n"
-"		int			m_minIndices[4];\n"
-"	};\n"
-"	union\n"
-"	{\n"
-"		float4	m_max;\n"
-"		float   m_maxElems[4];\n"
-"		int			m_maxIndices[4];\n"
-"	};\n"
-"} b3AabbCL;\n"
-"unsigned int interleaveBits(unsigned int x)\n"
-"{\n"
-"	//........ ........ ......12 3456789A	//x\n"
-"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
-"	\n"
-"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n"
-"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n"
-"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n"
-"	\n"
-"	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)\n"
-"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n"
-"	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;\n"
-"	\n"
-"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)\n"
-"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n"
-"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;\n"
-"	\n"
-"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)\n"
-"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n"
-"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;\n"
-"	\n"
-"	//........ ........ ......11 11111111	//0x000003FF\n"
-"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
-"	\n"
-"	x = (x ^ (x << 16)) & 0xFF0000FF;\n"
-"	x = (x ^ (x <<  8)) & 0x0300F00F;\n"
-"	x = (x ^ (x <<  4)) & 0x030C30C3;\n"
-"	x = (x ^ (x <<  2)) & 0x09249249;\n"
-"	\n"
-"	return x;\n"
-"}\n"
-"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
-"{\n"
-"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
-"}\n"
-"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
-"{\n"
-"	int separatedAabbIndex = get_global_id(0);\n"
-"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
-"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
-"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
-"}\n"
-"//Should replace with an optimized parallel reduction\n"
-"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
-"{\n"
-"	//Each time this kernel is added to the command queue, \n"
-"	//the number of AABBs needing to be merged is halved\n"
-"	//\n"
-"	//Example with 159 AABBs:\n"
-"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
-"	//	numMergedAabbs == 159 - 80 == 79\n"
-"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
-"	\n"
-"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
-"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
-"	\n"
-"	int aabbIndex = get_global_id(0);\n"
-"	if(aabbIndex >= numMergedAabbs) return;\n"
-"	\n"
-"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
-"	\n"
-"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
-"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
-"		\n"
-"	b3AabbCL mergedAabb;\n"
-"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
-"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
-"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
-"}\n"
-"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
-"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
-"{\n"
-"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
-"	if(leafNodeIndex >= numAabbs) return;\n"
-"	\n"
-"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
-"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
-"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
-"	\n"
-"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
-"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
-"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
-"	\n"
-"	//Quantize into integer coordinates\n"
-"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
-"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
-"	\n"
-"	int4 discretePosition;\n"
-"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
-"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
-"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
-"	\n"
-"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
-"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
-"	discretePosition += 512;\n"
-"	\n"
-"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
-"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
-"	\n"
-"	//\n"
-"	SortDataCL mortonCodeIndexPair;\n"
-"	mortonCodeIndexPair.m_key = mortonCode;\n"
-"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
-"	\n"
-"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
-"}\n"
-"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
-"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
-"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
-"//In both cases, the bit should be cleared to access the actual node index.\n"
-"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
-"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
-"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
-"//From sap.cl\n"
-"#define NEW_PAIR_MARKER -1\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
-"{\n"
-"	bool overlap = true;\n"
-"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-"	return overlap;\n"
-"}\n"
-"//From sap.cl\n"
-"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
-"											__global int* rootNodeIndex, \n"
-"											__global int2* internalNodeChildIndices, \n"
-"											__global b3AabbCL* internalNodeAabbs,\n"
-"											__global int2* internalNodeLeafIndexRanges,\n"
-"											\n"
-"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
-"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
-"											int maxPairs, int numQueryAabbs)\n"
-"{\n"
-"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
-"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
-"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
-"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
-"	\n"
-"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
-"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
-"	\n"
-"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
-"	\n"
-"	int stackSize = 1;\n"
-"	stack[0] = *rootNodeIndex;\n"
-"	\n"
-"	while(stackSize)\n"
-"	{\n"
-"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
-"		--stackSize;\n"
-"		\n"
-"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
-"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
-"		\n"
-"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
-"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
-"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
-"		{\n"
-"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
-"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
-"		}\n"
-"		\n"
-"		//bvhRigidIndex is not used if internal node\n"
-"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
-"	\n"
-"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
-"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
-"		{\n"
-"			if(isLeaf)\n"
-"			{\n"
-"				int4 pair;\n"
-"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
-"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
-"				pair.z = NEW_PAIR_MARKER;\n"
-"				pair.w = NEW_PAIR_MARKER;\n"
-"				\n"
-"				int pairIndex = atomic_inc(out_numPairs);\n"
-"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
-"			}\n"
-"			\n"
-"			if(!isLeaf)	//Internal node\n"
-"			{\n"
-"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
-"				{\n"
-"					//Error\n"
-"				}\n"
-"				else\n"
-"				{\n"
-"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
-"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		\n"
-"	}\n"
-"}\n"
-"//From rayCastKernels.cl\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_from;\n"
-"	float4 m_to;\n"
-"} b3RayInfo;\n"
-"//From rayCastKernels.cl\n"
-"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
-"{\n"
-"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
-"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n"
-"}\n"
-"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
-"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
-"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
-"{\n"
-"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
-"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
-"	//\n"
-"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
-"	//and min.x will be the far plane; otherwise, it is reversed.\n"
-"	//\n"
-"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
-"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
-"	\n"
-"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n"
-"	\n"
-"	//When using vector types, the select() function checks the most signficant bit, \n"
-"	//but isless() sets the least significant bit.\n"
-"	isNegative <<= 31;\n"
-"	//select(b, a, condition) == condition ? a : b\n"
-"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
-"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
-"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
-"	\n"
-"	b3Scalar t_min_final = 0.0f;\n"
-"	b3Scalar t_max_final = rayLength;\n"
-"	\n"
-"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
-"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
-"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
-"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
-"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
-"	\n"
-"	return (t_min_final <= t_max_final);\n"
-"}\n"
-"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
-"								__global int* rootNodeIndex, \n"
-"								__global int2* internalNodeChildIndices, \n"
-"								__global b3AabbCL* internalNodeAabbs,\n"
-"								__global int2* internalNodeLeafIndexRanges,\n"
-"								__global SortDataCL* mortonCodesAndAabbIndices,\n"
-"								\n"
-"								__global b3RayInfo* rays,\n"
-"								\n"
-"								__global int* out_numRayRigidPairs, \n"
-"								__global int2* out_rayRigidPairs,\n"
-"								int maxRayRigidPairs, int numRays)\n"
-"{\n"
-"	int rayIndex = get_global_id(0);\n"
-"	if(rayIndex >= numRays) return;\n"
-"	\n"
-"	//\n"
-"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
-"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
-"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
-"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
-"	\n"
-"	//\n"
-"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
-"	\n"
-"	int stackSize = 1;\n"
-"	stack[0] = *rootNodeIndex;\n"
-"	\n"
-"	while(stackSize)\n"
-"	{\n"
-"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
-"		--stackSize;\n"
-"		\n"
-"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
-"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
-"		\n"
-"		//bvhRigidIndex is not used if internal node\n"
-"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
-"	\n"
-"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
-"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n"
-"		{\n"
-"			if(isLeaf)\n"
-"			{\n"
-"				int2 rayRigidPair;\n"
-"				rayRigidPair.x = rayIndex;\n"
-"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
-"				\n"
-"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
-"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
-"			}\n"
-"			\n"
-"			if(!isLeaf)	//Internal node\n"
-"			{\n"
-"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
-"				{\n"
-"					//Error\n"
-"				}\n"
-"				else\n"
-"				{\n"
-"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
-"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
-"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
-"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
-"{\n"
-"	int smallAabbIndex = get_global_id(0);\n"
-"	if(smallAabbIndex >= numSmallAabbRigids) return;\n"
-"	\n"
-"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
-"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
-"	{\n"
-"		b3AabbCL largeAabb = largeAabbs[i];\n"
-"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
-"		{\n"
-"			int4 pair;\n"
-"			pair.x = largeAabb.m_minIndices[3];\n"
-"			pair.y = smallAabb.m_minIndices[3];\n"
-"			pair.z = NEW_PAIR_MARKER;\n"
-"			pair.w = NEW_PAIR_MARKER;\n"
-"			\n"
-"			int pairIndex = atomic_inc(out_numPairs);\n"
-"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
-"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n"
-"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
-"{\n"
-"	int rayIndex = get_global_id(0);\n"
-"	if(rayIndex >= numRays) return;\n"
-"	\n"
-"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
-"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
-"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
-"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
-"	\n"
-"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
-"	{\n"
-"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
-"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
-"		{\n"
-"			int2 rayRigidPair;\n"
-"			rayRigidPair.x = rayIndex;\n"
-"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
-"			\n"
-"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
-"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
-"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
-"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
-"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
-"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
-"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
-"#define b3Int64 long\n"
-"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
-"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
-"{\n"
-"	//This function only needs to return (i & j) in order for the algorithm to work,\n"
-"	//but it may help with debugging to mask out the lower bits.\n"
-"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
-"	b3Int64 sharedBits = i & j;\n"
-"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n"
-"	\n"
-"	return sharedBits & bitmask;\n"
-"}\n"
-"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
-"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
-"{\n"
-"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
-"}\n"
-"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
-"											__global b3Int64* out_commonPrefixes,\n"
-"											__global int* out_commonPrefixLengths,\n"
-"											int numInternalNodes)\n"
-"{\n"
-"	int internalNodeIndex = get_global_id(0);\n"
-"	if (internalNodeIndex >= numInternalNodes) return;\n"
-"	\n"
-"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
-"	//and the number of internal nodes is always numLeafNodes - 1\n"
-"	int leftLeafIndex = internalNodeIndex;\n"
-"	int rightLeafIndex = internalNodeIndex + 1;\n"
-"	\n"
-"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
-"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
-"	\n"
-"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
-"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n"
-"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
-"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
-"	//\n"
-"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
-"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
-"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
-"	\n"
-"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
-"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
-"											__global int2* out_childNodes, int numLeafNodes)\n"
-"{\n"
-"	int leafNodeIndex = get_global_id(0);\n"
-"	if (leafNodeIndex >= numLeafNodes) return;\n"
-"	\n"
-"	int numInternalNodes = numLeafNodes - 1;\n"
-"	\n"
-"	int leftSplitIndex = leafNodeIndex - 1;\n"
-"	int rightSplitIndex = leafNodeIndex;\n"
-"	\n"
-"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-"	\n"
-"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
-"	//Leaf nodes are considered as having the highest common prefix\n"
-"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
-"	\n"
-"	//Handle cases for the edge nodes; the first and last node\n"
-"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
-"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
-"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
-"	\n"
-"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
-"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
-"	\n"
-"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n"
-"	\n"
-"	//out_childNodesAsInt[0] == int2.x == left child\n"
-"	//out_childNodesAsInt[1] == int2.y == right child\n"
-"	int isLeaf = 1;\n"
-"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
-"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
-"												__global int2* out_childNodes,\n"
-"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
-"												int numInternalNodes)\n"
-"{\n"
-"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
-"	if(internalNodeIndex >= numInternalNodes) return;\n"
-"	\n"
-"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
-"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
-"	\n"
-"//#define USE_LINEAR_SEARCH\n"
-"#ifdef USE_LINEAR_SEARCH\n"
-"	int leftIndex = -1;\n"
-"	int rightIndex = -1;\n"
-"	\n"
-"	//Find nearest element to left with a lower common prefix\n"
-"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
-"	{\n"
-"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
-"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
-"		{\n"
-"			leftIndex = i;\n"
-"			break;\n"
-"		}\n"
-"	}\n"
-"	\n"
-"	//Find nearest element to right with a lower common prefix\n"
-"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
-"	{\n"
-"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
-"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
-"		{\n"
-"			rightIndex = i;\n"
-"			break;\n"
-"		}\n"
-"	}\n"
-"	\n"
-"#else //Use binary search\n"
-"	//Find nearest element to left with a lower common prefix\n"
-"	int leftIndex = -1;\n"
-"	{\n"
-"		int lower = 0;\n"
-"		int upper = internalNodeIndex - 1;\n"
-"		\n"
-"		while(lower <= upper)\n"
-"		{\n"
-"			int mid = (lower + upper) / 2;\n"
-"			b3Int64 midPrefix = commonPrefixes[mid];\n"
-"			int midPrefixLength = commonPrefixLengths[mid];\n"
-"			\n"
-"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
-"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
-"			{\n"
-"				int right = mid + 1;\n"
-"				if(right < internalNodeIndex)\n"
-"				{\n"
-"					b3Int64 rightPrefix = commonPrefixes[right];\n"
-"					int rightPrefixLength = commonPrefixLengths[right];\n"
-"					\n"
-"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
-"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
-"					{\n"
-"						lower = right;\n"
-"						leftIndex = right;\n"
-"					}\n"
-"					else \n"
-"					{\n"
-"						leftIndex = mid;\n"
-"						break;\n"
-"					}\n"
-"				}\n"
-"				else \n"
-"				{\n"
-"					leftIndex = mid;\n"
-"					break;\n"
-"				}\n"
-"			}\n"
-"			else upper = mid - 1;\n"
-"		}\n"
-"	}\n"
-"	\n"
-"	//Find nearest element to right with a lower common prefix\n"
-"	int rightIndex = -1;\n"
-"	{\n"
-"		int lower = internalNodeIndex + 1;\n"
-"		int upper = numInternalNodes - 1;\n"
-"		\n"
-"		while(lower <= upper)\n"
-"		{\n"
-"			int mid = (lower + upper) / 2;\n"
-"			b3Int64 midPrefix = commonPrefixes[mid];\n"
-"			int midPrefixLength = commonPrefixLengths[mid];\n"
-"			\n"
-"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
-"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
-"			{\n"
-"				int left = mid - 1;\n"
-"				if(left > internalNodeIndex)\n"
-"				{\n"
-"					b3Int64 leftPrefix = commonPrefixes[left];\n"
-"					int leftPrefixLength = commonPrefixLengths[left];\n"
-"				\n"
-"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
-"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
-"					{\n"
-"						upper = left;\n"
-"						rightIndex = left;\n"
-"					}\n"
-"					else \n"
-"					{\n"
-"						rightIndex = mid;\n"
-"						break;\n"
-"					}\n"
-"				}\n"
-"				else \n"
-"				{\n"
-"					rightIndex = mid;\n"
-"					break;\n"
-"				}\n"
-"			}\n"
-"			else lower = mid + 1;\n"
-"		}\n"
-"	}\n"
-"#endif\n"
-"	\n"
-"	//Select parent\n"
-"	{\n"
-"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-"		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-"		\n"
-"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
-"		\n"
-"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
-"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
-"		\n"
-"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
-"		\n"
-"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
-"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
-"		\n"
-"		int isLeaf = 0;\n"
-"		if(!isRootNode)\n"
-"		{\n"
-"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n"
-"			\n"
-"			//out_childNodesAsInt[0] == int2.x == left child\n"
-"			//out_childNodesAsInt[1] == int2.y == right child\n"
-"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
-"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
-"		}\n"
-"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
-"	}\n"
-"}\n"
-"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
-"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
-"{\n"
-"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
-"	int internalNodeIndex = get_global_id(0);\n"
-"	if(internalNodeIndex >= numInternalNodes) return;\n"
-"	\n"
-"	//\n"
-"	int distanceFromRoot = 0;\n"
-"	{\n"
-"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
-"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
-"		{\n"
-"			parentIndex = internalNodeParentNodes[parentIndex];\n"
-"			++distanceFromRoot;\n"
-"		}\n"
-"	}\n"
-"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
-"	\n"
-"	//\n"
-"	__local int localMaxDistanceFromRoot;\n"
-"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	\n"
-"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	\n"
-"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
-"												__global int2* childNodes,\n"
-"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
-"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
-"{\n"
-"	int internalNodeIndex = get_global_id(0);\n"
-"	if(internalNodeIndex >= numInternalNodes) return;\n"
-"	\n"
-"	int distance = distanceFromRoot[internalNodeIndex];\n"
-"	\n"
-"	if(distance == processedDistance)\n"
-"	{\n"
-"		int leftChildIndex = childNodes[internalNodeIndex].x;\n"
-"		int rightChildIndex = childNodes[internalNodeIndex].y;\n"
-"		\n"
-"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
-"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
-"		\n"
-"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
-"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
-"		\n"
-"		//leftRigidIndex/rightRigidIndex is not used if internal node\n"
-"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
-"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
-"		\n"
-"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
-"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
-"		\n"
-"		b3AabbCL mergedAabb;\n"
-"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
-"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
-"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
-"	}\n"
-"}\n"
-"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
-"{\n"
-"	int internalNodeIndex = get_global_id(0);\n"
-"	if(internalNodeIndex >= numInternalNodes) return;\n"
-"	\n"
-"	int numLeafNodes = numInternalNodes + 1;\n"
-"	\n"
-"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
-"	\n"
-"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
-"	\n"
-"	//Find lowest leaf index covered by this internal node\n"
-"	{\n"
-"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
-"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
-"		leafIndexRange.x = lowestIndex;\n"
-"	}\n"
-"	\n"
-"	//Find highest leaf index covered by this internal node\n"
-"	{\n"
-"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
-"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
-"		leafIndexRange.y = highestIndex;\n"
-"	}\n"
-"	\n"
-"	//\n"
-"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
-"}\n"
-;
+static const char* parallelLinearBvhCL =
+	"/*\n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose,\n"
+	"including commercial applications, and to alter it and redistribute it freely,\n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Initial Author Jackson Lee, 2014\n"
+	"typedef float b3Scalar;\n"
+	"typedef float4 b3Vector3;\n"
+	"#define b3Max max\n"
+	"#define b3Min min\n"
+	"#define b3Sqrt sqrt\n"
+	"typedef struct\n"
+	"{\n"
+	"	unsigned int m_key;\n"
+	"	unsigned int m_value;\n"
+	"} SortDataCL;\n"
+	"typedef struct \n"
+	"{\n"
+	"	union\n"
+	"	{\n"
+	"		float4	m_min;\n"
+	"		float   m_minElems[4];\n"
+	"		int			m_minIndices[4];\n"
+	"	};\n"
+	"	union\n"
+	"	{\n"
+	"		float4	m_max;\n"
+	"		float   m_maxElems[4];\n"
+	"		int			m_maxIndices[4];\n"
+	"	};\n"
+	"} b3AabbCL;\n"
+	"unsigned int interleaveBits(unsigned int x)\n"
+	"{\n"
+	"	//........ ........ ......12 3456789A	//x\n"
+	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
+	"	\n"
+	"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n"
+	"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n"
+	"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+	"	\n"
+	"	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)\n"
+	"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n"
+	"	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;\n"
+	"	\n"
+	"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)\n"
+	"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n"
+	"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;\n"
+	"	\n"
+	"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)\n"
+	"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n"
+	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;\n"
+	"	\n"
+	"	//........ ........ ......11 11111111	//0x000003FF\n"
+	"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
+	"	\n"
+	"	x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+	"	x = (x ^ (x <<  8)) & 0x0300F00F;\n"
+	"	x = (x ^ (x <<  4)) & 0x030C30C3;\n"
+	"	x = (x ^ (x <<  2)) & 0x09249249;\n"
+	"	\n"
+	"	return x;\n"
+	"}\n"
+	"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
+	"{\n"
+	"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
+	"}\n"
+	"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
+	"{\n"
+	"	int separatedAabbIndex = get_global_id(0);\n"
+	"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
+	"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
+	"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
+	"}\n"
+	"//Should replace with an optimized parallel reduction\n"
+	"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
+	"{\n"
+	"	//Each time this kernel is added to the command queue, \n"
+	"	//the number of AABBs needing to be merged is halved\n"
+	"	//\n"
+	"	//Example with 159 AABBs:\n"
+	"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
+	"	//	numMergedAabbs == 159 - 80 == 79\n"
+	"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
+	"	\n"
+	"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
+	"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
+	"	\n"
+	"	int aabbIndex = get_global_id(0);\n"
+	"	if(aabbIndex >= numMergedAabbs) return;\n"
+	"	\n"
+	"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
+	"	\n"
+	"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+	"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
+	"		\n"
+	"	b3AabbCL mergedAabb;\n"
+	"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+	"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+	"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
+	"}\n"
+	"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
+	"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
+	"{\n"
+	"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
+	"	if(leafNodeIndex >= numAabbs) return;\n"
+	"	\n"
+	"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
+	"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
+	"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
+	"	\n"
+	"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+	"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+	"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
+	"	\n"
+	"	//Quantize into integer coordinates\n"
+	"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
+	"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
+	"	\n"
+	"	int4 discretePosition;\n"
+	"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
+	"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
+	"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
+	"	\n"
+	"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
+	"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
+	"	discretePosition += 512;\n"
+	"	\n"
+	"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
+	"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
+	"	\n"
+	"	//\n"
+	"	SortDataCL mortonCodeIndexPair;\n"
+	"	mortonCodeIndexPair.m_key = mortonCode;\n"
+	"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
+	"	\n"
+	"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
+	"}\n"
+	"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
+	"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
+	"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
+	"//In both cases, the bit should be cleared to access the actual node index.\n"
+	"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
+	"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
+	"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
+	"//From sap.cl\n"
+	"#define NEW_PAIR_MARKER -1\n"
+	"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
+	"{\n"
+	"	bool overlap = true;\n"
+	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+	"	return overlap;\n"
+	"}\n"
+	"//From sap.cl\n"
+	"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
+	"											__global int* rootNodeIndex, \n"
+	"											__global int2* internalNodeChildIndices, \n"
+	"											__global b3AabbCL* internalNodeAabbs,\n"
+	"											__global int2* internalNodeLeafIndexRanges,\n"
+	"											\n"
+	"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
+	"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+	"											int maxPairs, int numQueryAabbs)\n"
+	"{\n"
+	"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
+	"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
+	"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+	"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
+	"	\n"
+	"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
+	"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
+	"	\n"
+	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
+	"	\n"
+	"	int stackSize = 1;\n"
+	"	stack[0] = *rootNodeIndex;\n"
+	"	\n"
+	"	while(stackSize)\n"
+	"	{\n"
+	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+	"		--stackSize;\n"
+	"		\n"
+	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
+	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+	"		\n"
+	"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
+	"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
+	"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
+	"		{\n"
+	"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
+	"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
+	"		}\n"
+	"		\n"
+	"		//bvhRigidIndex is not used if internal node\n"
+	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+	"	\n"
+	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+	"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
+	"		{\n"
+	"			if(isLeaf)\n"
+	"			{\n"
+	"				int4 pair;\n"
+	"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
+	"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+	"				pair.z = NEW_PAIR_MARKER;\n"
+	"				pair.w = NEW_PAIR_MARKER;\n"
+	"				\n"
+	"				int pairIndex = atomic_inc(out_numPairs);\n"
+	"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+	"			}\n"
+	"			\n"
+	"			if(!isLeaf)	//Internal node\n"
+	"			{\n"
+	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+	"				{\n"
+	"					//Error\n"
+	"				}\n"
+	"				else\n"
+	"				{\n"
+	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		\n"
+	"	}\n"
+	"}\n"
+	"//From rayCastKernels.cl\n"
+	"typedef struct\n"
+	"{\n"
+	"	float4 m_from;\n"
+	"	float4 m_to;\n"
+	"} b3RayInfo;\n"
+	"//From rayCastKernels.cl\n"
+	"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
+	"{\n"
+	"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
+	"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n"
+	"}\n"
+	"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
+	"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
+	"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
+	"{\n"
+	"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
+	"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
+	"	//\n"
+	"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
+	"	//and min.x will be the far plane; otherwise, it is reversed.\n"
+	"	//\n"
+	"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
+	"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
+	"	\n"
+	"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n"
+	"	\n"
+	"	//When using vector types, the select() function checks the most signficant bit, \n"
+	"	//but isless() sets the least significant bit.\n"
+	"	isNegative <<= 31;\n"
+	"	//select(b, a, condition) == condition ? a : b\n"
+	"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
+	"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+	"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+	"	\n"
+	"	b3Scalar t_min_final = 0.0f;\n"
+	"	b3Scalar t_max_final = rayLength;\n"
+	"	\n"
+	"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
+	"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
+	"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
+	"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
+	"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
+	"	\n"
+	"	return (t_min_final <= t_max_final);\n"
+	"}\n"
+	"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
+	"								__global int* rootNodeIndex, \n"
+	"								__global int2* internalNodeChildIndices, \n"
+	"								__global b3AabbCL* internalNodeAabbs,\n"
+	"								__global int2* internalNodeLeafIndexRanges,\n"
+	"								__global SortDataCL* mortonCodesAndAabbIndices,\n"
+	"								\n"
+	"								__global b3RayInfo* rays,\n"
+	"								\n"
+	"								__global int* out_numRayRigidPairs, \n"
+	"								__global int2* out_rayRigidPairs,\n"
+	"								int maxRayRigidPairs, int numRays)\n"
+	"{\n"
+	"	int rayIndex = get_global_id(0);\n"
+	"	if(rayIndex >= numRays) return;\n"
+	"	\n"
+	"	//\n"
+	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+	"	\n"
+	"	//\n"
+	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
+	"	\n"
+	"	int stackSize = 1;\n"
+	"	stack[0] = *rootNodeIndex;\n"
+	"	\n"
+	"	while(stackSize)\n"
+	"	{\n"
+	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+	"		--stackSize;\n"
+	"		\n"
+	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
+	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+	"		\n"
+	"		//bvhRigidIndex is not used if internal node\n"
+	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+	"	\n"
+	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n"
+	"		{\n"
+	"			if(isLeaf)\n"
+	"			{\n"
+	"				int2 rayRigidPair;\n"
+	"				rayRigidPair.x = rayIndex;\n"
+	"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+	"				\n"
+	"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+	"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+	"			}\n"
+	"			\n"
+	"			if(!isLeaf)	//Internal node\n"
+	"			{\n"
+	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+	"				{\n"
+	"					//Error\n"
+	"				}\n"
+	"				else\n"
+	"				{\n"
+	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
+	"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+	"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
+	"{\n"
+	"	int smallAabbIndex = get_global_id(0);\n"
+	"	if(smallAabbIndex >= numSmallAabbRigids) return;\n"
+	"	\n"
+	"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
+	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+	"	{\n"
+	"		b3AabbCL largeAabb = largeAabbs[i];\n"
+	"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
+	"		{\n"
+	"			int4 pair;\n"
+	"			pair.x = largeAabb.m_minIndices[3];\n"
+	"			pair.y = smallAabb.m_minIndices[3];\n"
+	"			pair.z = NEW_PAIR_MARKER;\n"
+	"			pair.w = NEW_PAIR_MARKER;\n"
+	"			\n"
+	"			int pairIndex = atomic_inc(out_numPairs);\n"
+	"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
+	"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n"
+	"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
+	"{\n"
+	"	int rayIndex = get_global_id(0);\n"
+	"	if(rayIndex >= numRays) return;\n"
+	"	\n"
+	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+	"	\n"
+	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+	"	{\n"
+	"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
+	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
+	"		{\n"
+	"			int2 rayRigidPair;\n"
+	"			rayRigidPair.x = rayIndex;\n"
+	"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
+	"			\n"
+	"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+	"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
+	"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
+	"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
+	"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
+	"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
+	"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
+	"#define b3Int64 long\n"
+	"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
+	"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
+	"{\n"
+	"	//This function only needs to return (i & j) in order for the algorithm to work,\n"
+	"	//but it may help with debugging to mask out the lower bits.\n"
+	"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
+	"	b3Int64 sharedBits = i & j;\n"
+	"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n"
+	"	\n"
+	"	return sharedBits & bitmask;\n"
+	"}\n"
+	"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
+	"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
+	"{\n"
+	"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
+	"}\n"
+	"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
+	"											__global b3Int64* out_commonPrefixes,\n"
+	"											__global int* out_commonPrefixLengths,\n"
+	"											int numInternalNodes)\n"
+	"{\n"
+	"	int internalNodeIndex = get_global_id(0);\n"
+	"	if (internalNodeIndex >= numInternalNodes) return;\n"
+	"	\n"
+	"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
+	"	//and the number of internal nodes is always numLeafNodes - 1\n"
+	"	int leftLeafIndex = internalNodeIndex;\n"
+	"	int rightLeafIndex = internalNodeIndex + 1;\n"
+	"	\n"
+	"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
+	"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
+	"	\n"
+	"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
+	"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n"
+	"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
+	"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
+	"	//\n"
+	"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
+	"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
+	"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
+	"	\n"
+	"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+	"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+	"}\n"
+	"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
+	"											__global int2* out_childNodes, int numLeafNodes)\n"
+	"{\n"
+	"	int leafNodeIndex = get_global_id(0);\n"
+	"	if (leafNodeIndex >= numLeafNodes) return;\n"
+	"	\n"
+	"	int numInternalNodes = numLeafNodes - 1;\n"
+	"	\n"
+	"	int leftSplitIndex = leafNodeIndex - 1;\n"
+	"	int rightSplitIndex = leafNodeIndex;\n"
+	"	\n"
+	"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+	"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+	"	\n"
+	"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
+	"	//Leaf nodes are considered as having the highest common prefix\n"
+	"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
+	"	\n"
+	"	//Handle cases for the edge nodes; the first and last node\n"
+	"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
+	"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
+	"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
+	"	\n"
+	"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
+	"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
+	"	\n"
+	"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n"
+	"	\n"
+	"	//out_childNodesAsInt[0] == int2.x == left child\n"
+	"	//out_childNodesAsInt[1] == int2.y == right child\n"
+	"	int isLeaf = 1;\n"
+	"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+	"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
+	"}\n"
+	"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
+	"												__global int2* out_childNodes,\n"
+	"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
+	"												int numInternalNodes)\n"
+	"{\n"
+	"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+	"	if(internalNodeIndex >= numInternalNodes) return;\n"
+	"	\n"
+	"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
+	"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
+	"	\n"
+	"//#define USE_LINEAR_SEARCH\n"
+	"#ifdef USE_LINEAR_SEARCH\n"
+	"	int leftIndex = -1;\n"
+	"	int rightIndex = -1;\n"
+	"	\n"
+	"	//Find nearest element to left with a lower common prefix\n"
+	"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
+	"	{\n"
+	"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+	"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
+	"		{\n"
+	"			leftIndex = i;\n"
+	"			break;\n"
+	"		}\n"
+	"	}\n"
+	"	\n"
+	"	//Find nearest element to right with a lower common prefix\n"
+	"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
+	"	{\n"
+	"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+	"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
+	"		{\n"
+	"			rightIndex = i;\n"
+	"			break;\n"
+	"		}\n"
+	"	}\n"
+	"	\n"
+	"#else //Use binary search\n"
+	"	//Find nearest element to left with a lower common prefix\n"
+	"	int leftIndex = -1;\n"
+	"	{\n"
+	"		int lower = 0;\n"
+	"		int upper = internalNodeIndex - 1;\n"
+	"		\n"
+	"		while(lower <= upper)\n"
+	"		{\n"
+	"			int mid = (lower + upper) / 2;\n"
+	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
+	"			int midPrefixLength = commonPrefixLengths[mid];\n"
+	"			\n"
+	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+	"			{\n"
+	"				int right = mid + 1;\n"
+	"				if(right < internalNodeIndex)\n"
+	"				{\n"
+	"					b3Int64 rightPrefix = commonPrefixes[right];\n"
+	"					int rightPrefixLength = commonPrefixLengths[right];\n"
+	"					\n"
+	"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
+	"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
+	"					{\n"
+	"						lower = right;\n"
+	"						leftIndex = right;\n"
+	"					}\n"
+	"					else \n"
+	"					{\n"
+	"						leftIndex = mid;\n"
+	"						break;\n"
+	"					}\n"
+	"				}\n"
+	"				else \n"
+	"				{\n"
+	"					leftIndex = mid;\n"
+	"					break;\n"
+	"				}\n"
+	"			}\n"
+	"			else upper = mid - 1;\n"
+	"		}\n"
+	"	}\n"
+	"	\n"
+	"	//Find nearest element to right with a lower common prefix\n"
+	"	int rightIndex = -1;\n"
+	"	{\n"
+	"		int lower = internalNodeIndex + 1;\n"
+	"		int upper = numInternalNodes - 1;\n"
+	"		\n"
+	"		while(lower <= upper)\n"
+	"		{\n"
+	"			int mid = (lower + upper) / 2;\n"
+	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
+	"			int midPrefixLength = commonPrefixLengths[mid];\n"
+	"			\n"
+	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+	"			{\n"
+	"				int left = mid - 1;\n"
+	"				if(left > internalNodeIndex)\n"
+	"				{\n"
+	"					b3Int64 leftPrefix = commonPrefixes[left];\n"
+	"					int leftPrefixLength = commonPrefixLengths[left];\n"
+	"				\n"
+	"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
+	"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
+	"					{\n"
+	"						upper = left;\n"
+	"						rightIndex = left;\n"
+	"					}\n"
+	"					else \n"
+	"					{\n"
+	"						rightIndex = mid;\n"
+	"						break;\n"
+	"					}\n"
+	"				}\n"
+	"				else \n"
+	"				{\n"
+	"					rightIndex = mid;\n"
+	"					break;\n"
+	"				}\n"
+	"			}\n"
+	"			else lower = mid + 1;\n"
+	"		}\n"
+	"	}\n"
+	"#endif\n"
+	"	\n"
+	"	//Select parent\n"
+	"	{\n"
+	"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+	"		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+	"		\n"
+	"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
+	"		\n"
+	"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
+	"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
+	"		\n"
+	"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
+	"		\n"
+	"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
+	"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
+	"		\n"
+	"		int isLeaf = 0;\n"
+	"		if(!isRootNode)\n"
+	"		{\n"
+	"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n"
+	"			\n"
+	"			//out_childNodesAsInt[0] == int2.x == left child\n"
+	"			//out_childNodesAsInt[1] == int2.y == right child\n"
+	"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+	"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+	"		}\n"
+	"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+	"	}\n"
+	"}\n"
+	"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
+	"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
+	"{\n"
+	"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
+	"	int internalNodeIndex = get_global_id(0);\n"
+	"	if(internalNodeIndex >= numInternalNodes) return;\n"
+	"	\n"
+	"	//\n"
+	"	int distanceFromRoot = 0;\n"
+	"	{\n"
+	"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
+	"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
+	"		{\n"
+	"			parentIndex = internalNodeParentNodes[parentIndex];\n"
+	"			++distanceFromRoot;\n"
+	"		}\n"
+	"	}\n"
+	"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
+	"	\n"
+	"	//\n"
+	"	__local int localMaxDistanceFromRoot;\n"
+	"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	\n"
+	"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	\n"
+	"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
+	"}\n"
+	"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
+	"												__global int2* childNodes,\n"
+	"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
+	"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
+	"{\n"
+	"	int internalNodeIndex = get_global_id(0);\n"
+	"	if(internalNodeIndex >= numInternalNodes) return;\n"
+	"	\n"
+	"	int distance = distanceFromRoot[internalNodeIndex];\n"
+	"	\n"
+	"	if(distance == processedDistance)\n"
+	"	{\n"
+	"		int leftChildIndex = childNodes[internalNodeIndex].x;\n"
+	"		int rightChildIndex = childNodes[internalNodeIndex].y;\n"
+	"		\n"
+	"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
+	"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
+	"		\n"
+	"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
+	"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
+	"		\n"
+	"		//leftRigidIndex/rightRigidIndex is not used if internal node\n"
+	"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
+	"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
+	"		\n"
+	"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
+	"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
+	"		\n"
+	"		b3AabbCL mergedAabb;\n"
+	"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
+	"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
+	"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
+	"	}\n"
+	"}\n"
+	"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
+	"{\n"
+	"	int internalNodeIndex = get_global_id(0);\n"
+	"	if(internalNodeIndex >= numInternalNodes) return;\n"
+	"	\n"
+	"	int numLeafNodes = numInternalNodes + 1;\n"
+	"	\n"
+	"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
+	"	\n"
+	"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
+	"	\n"
+	"	//Find lowest leaf index covered by this internal node\n"
+	"	{\n"
+	"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
+	"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
+	"		leafIndexRange.x = lowestIndex;\n"
+	"	}\n"
+	"	\n"
+	"	//Find highest leaf index covered by this internal node\n"
+	"	{\n"
+	"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
+	"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
+	"		leafIndexRange.y = highestIndex;\n"
+	"	}\n"
+	"	\n"
+	"	//\n"
+	"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
index 04d40fcf26..d6999b94cb 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -1,342 +1,341 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* sapCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Erwin Coumans\n"
-"#define NEW_PAIR_MARKER -1\n"
-"typedef struct \n"
-"{\n"
-"	union\n"
-"	{\n"
-"		float4	m_min;\n"
-"		float   m_minElems[4];\n"
-"		int			m_minIndices[4];\n"
-"	};\n"
-"	union\n"
-"	{\n"
-"		float4	m_max;\n"
-"		float   m_maxElems[4];\n"
-"		int			m_maxIndices[4];\n"
-"	};\n"
-"} btAabbCL;\n"
-"/// conservative test for overlap between two aabbs\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
-"{\n"
-"	bool overlap = true;\n"
-"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-"	return overlap;\n"
-"}\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
-"{\n"
-"	bool overlap = true;\n"
-"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-"	return overlap;\n"
-"}\n"
-"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
-"{\n"
-"	bool overlap = true;\n"
-"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-"	return overlap;\n"
-"}\n"
-"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numUnsortedAabbs)\n"
-"		return;\n"
-"	int j = get_global_id(1);\n"
-"	if (j>=numUnSortedAabbs2)\n"
-"		return;\n"
-"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
-"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
-"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
-"	{\n"
-"		int4 myPair;\n"
-"		\n"
-"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
-"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
-"		if (xIndex>yIndex)\n"
-"		{\n"
-"			int tmp = xIndex;\n"
-"			xIndex=yIndex;\n"
-"			yIndex=tmp;\n"
-"		}\n"
-"		\n"
-"		myPair.x = xIndex;\n"
-"		myPair.y = yIndex;\n"
-"		myPair.z = NEW_PAIR_MARKER;\n"
-"		myPair.w = NEW_PAIR_MARKER;\n"
-"		int curPair = atomic_inc (pairCount);\n"
-"		if (curPair<maxPairs)\n"
-"		{\n"
-"				pairsOut[curPair] = myPair; //flush to main memory\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	for (int j=i+1;j<numObjects;j++)\n"
-"	{\n"
-"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-"		{\n"
-"			int4 myPair;\n"
-"			myPair.x = aabbs[i].m_minIndices[3];\n"
-"			myPair.y = aabbs[j].m_minIndices[3];\n"
-"			myPair.z = NEW_PAIR_MARKER;\n"
-"			myPair.w = NEW_PAIR_MARKER;\n"
-"			int curPair = atomic_inc (pairCount);\n"
-"			if (curPair<maxPairs)\n"
-"			{\n"
-"					pairsOut[curPair] = myPair; //flush to main memory\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	for (int j=i+1;j<numObjects;j++)\n"
-"	{\n"
-"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
-"		{\n"
-"			break;\n"
-"		}\n"
-"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-"		{\n"
-"			int4 myPair;\n"
-"			myPair.x = aabbs[i].m_minIndices[3];\n"
-"			myPair.y = aabbs[j].m_minIndices[3];\n"
-"			myPair.z = NEW_PAIR_MARKER;\n"
-"			myPair.w = NEW_PAIR_MARKER;\n"
-"			int curPair = atomic_inc (pairCount);\n"
-"			if (curPair<maxPairs)\n"
-"			{\n"
-"					pairsOut[curPair] = myPair; //flush to main memory\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	int localId = get_local_id(0);\n"
-"	__local int numActiveWgItems[1];\n"
-"	__local int breakRequest[1];\n"
-"	if (localId==0)\n"
-"	{\n"
-"		numActiveWgItems[0] = 0;\n"
-"		breakRequest[0] = 0;\n"
-"	}\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	atomic_inc(numActiveWgItems);\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	int localBreak = 0;\n"
-"	int j=i+1;\n"
-"	do\n"
-"	{\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	\n"
-"		if (j<numObjects)\n"
-"		{\n"
-"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
-"			{\n"
-"				if (!localBreak)\n"
-"				{\n"
-"					atomic_inc(breakRequest);\n"
-"					localBreak = 1;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (j>=numObjects && !localBreak)\n"
-"		{\n"
-"			atomic_inc(breakRequest);\n"
-"			localBreak = 1;\n"
-"		}\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (!localBreak)\n"
-"		{\n"
-"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-"			{\n"
-"				int4 myPair;\n"
-"				myPair.x = aabbs[i].m_minIndices[3];\n"
-"				myPair.y = aabbs[j].m_minIndices[3];\n"
-"				myPair.z = NEW_PAIR_MARKER;\n"
-"				myPair.w = NEW_PAIR_MARKER;\n"
-"				int curPair = atomic_inc (pairCount);\n"
-"				if (curPair<maxPairs)\n"
-"				{\n"
-"						pairsOut[curPair] = myPair; //flush to main memory\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		j++;\n"
-"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
-"}\n"
-"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	int localId = get_local_id(0);\n"
-"	__local int numActiveWgItems[1];\n"
-"	__local int breakRequest[1];\n"
-"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
-"	\n"
-"	btAabbCL myAabb;\n"
-"	\n"
-"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
-"	float testValue = 	myAabb.m_maxElems[axis];\n"
-"	\n"
-"	if (localId==0)\n"
-"	{\n"
-"		numActiveWgItems[0] = 0;\n"
-"		breakRequest[0] = 0;\n"
-"	}\n"
-"	int localCount=0;\n"
-"	int block=0;\n"
-"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
-"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
-"	\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	atomic_inc(numActiveWgItems);\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	int localBreak = 0;\n"
-"	\n"
-"	int j=i+1;\n"
-"	do\n"
-"	{\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	\n"
-"		if (j<numObjects)\n"
-"		{\n"
-"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
-"			{\n"
-"				if (!localBreak)\n"
-"				{\n"
-"					atomic_inc(breakRequest);\n"
-"					localBreak = 1;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (j>=numObjects && !localBreak)\n"
-"		{\n"
-"			atomic_inc(breakRequest);\n"
-"			localBreak = 1;\n"
-"		}\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (!localBreak)\n"
-"		{\n"
-"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
-"			{\n"
-"				int4 myPair;\n"
-"				myPair.x = myAabb.m_minIndices[3];\n"
-"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
-"				myPair.z = NEW_PAIR_MARKER;\n"
-"				myPair.w = NEW_PAIR_MARKER;\n"
-"				int curPair = atomic_inc (pairCount);\n"
-"				if (curPair<maxPairs)\n"
-"				{\n"
-"						pairsOut[curPair] = myPair; //flush to main memory\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		localCount++;\n"
-"		if (localCount==64)\n"
-"		{\n"
-"			localCount = 0;\n"
-"			block+=64;			\n"
-"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
-"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
-"		}\n"
-"		j++;\n"
-"		\n"
-"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
-"	\n"
-"}\n"
-"//http://stereopsis.com/radix.html\n"
-"unsigned int FloatFlip(float fl);\n"
-"unsigned int FloatFlip(float fl)\n"
-"{\n"
-"	unsigned int f = *(unsigned int*)&fl;\n"
-"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
-"	return f ^ mask;\n"
-"}\n"
-"float IFloatFlip(unsigned int f);\n"
-"float IFloatFlip(unsigned int f)\n"
-"{\n"
-"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
-"	unsigned int fl = f ^ mask;\n"
-"	return *(float*)&fl;\n"
-"}\n"
-"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	int src = destAabbs[i].m_maxIndices[3];\n"
-"	destAabbs[i] = allAabbs[src];\n"
-"	destAabbs[i].m_maxIndices[3] = src;\n"
-"}\n"
-"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	\n"
-"	\n"
-"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
-"	sortData[i].y = i;\n"
-"		\n"
-"}\n"
-"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	\n"
-"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
-"}\n"
-"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numAabbs)\n"
-"		return;\n"
-"	\n"
-"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
-"	\n"
-"	float4 s;\n"
-"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
-"	sum[i]=s;\n"
-"	sum2[i]=s*s;	\n"
-"}\n"
-;
+static const char* sapCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Erwin Coumans\n"
+	"#define NEW_PAIR_MARKER -1\n"
+	"typedef struct \n"
+	"{\n"
+	"	union\n"
+	"	{\n"
+	"		float4	m_min;\n"
+	"		float   m_minElems[4];\n"
+	"		int			m_minIndices[4];\n"
+	"	};\n"
+	"	union\n"
+	"	{\n"
+	"		float4	m_max;\n"
+	"		float   m_maxElems[4];\n"
+	"		int			m_maxIndices[4];\n"
+	"	};\n"
+	"} btAabbCL;\n"
+	"/// conservative test for overlap between two aabbs\n"
+	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
+	"{\n"
+	"	bool overlap = true;\n"
+	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+	"	return overlap;\n"
+	"}\n"
+	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+	"{\n"
+	"	bool overlap = true;\n"
+	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+	"	return overlap;\n"
+	"}\n"
+	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+	"{\n"
+	"	bool overlap = true;\n"
+	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+	"	return overlap;\n"
+	"}\n"
+	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numUnsortedAabbs)\n"
+	"		return;\n"
+	"	int j = get_global_id(1);\n"
+	"	if (j>=numUnSortedAabbs2)\n"
+	"		return;\n"
+	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
+	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
+	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
+	"	{\n"
+	"		int4 myPair;\n"
+	"		\n"
+	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
+	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
+	"		if (xIndex>yIndex)\n"
+	"		{\n"
+	"			int tmp = xIndex;\n"
+	"			xIndex=yIndex;\n"
+	"			yIndex=tmp;\n"
+	"		}\n"
+	"		\n"
+	"		myPair.x = xIndex;\n"
+	"		myPair.y = yIndex;\n"
+	"		myPair.z = NEW_PAIR_MARKER;\n"
+	"		myPair.w = NEW_PAIR_MARKER;\n"
+	"		int curPair = atomic_inc (pairCount);\n"
+	"		if (curPair<maxPairs)\n"
+	"		{\n"
+	"				pairsOut[curPair] = myPair; //flush to main memory\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numObjects)\n"
+	"		return;\n"
+	"	for (int j=i+1;j<numObjects;j++)\n"
+	"	{\n"
+	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+	"		{\n"
+	"			int4 myPair;\n"
+	"			myPair.x = aabbs[i].m_minIndices[3];\n"
+	"			myPair.y = aabbs[j].m_minIndices[3];\n"
+	"			myPair.z = NEW_PAIR_MARKER;\n"
+	"			myPair.w = NEW_PAIR_MARKER;\n"
+	"			int curPair = atomic_inc (pairCount);\n"
+	"			if (curPair<maxPairs)\n"
+	"			{\n"
+	"					pairsOut[curPair] = myPair; //flush to main memory\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numObjects)\n"
+	"		return;\n"
+	"	for (int j=i+1;j<numObjects;j++)\n"
+	"	{\n"
+	"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+	"		{\n"
+	"			break;\n"
+	"		}\n"
+	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+	"		{\n"
+	"			int4 myPair;\n"
+	"			myPair.x = aabbs[i].m_minIndices[3];\n"
+	"			myPair.y = aabbs[j].m_minIndices[3];\n"
+	"			myPair.z = NEW_PAIR_MARKER;\n"
+	"			myPair.w = NEW_PAIR_MARKER;\n"
+	"			int curPair = atomic_inc (pairCount);\n"
+	"			if (curPair<maxPairs)\n"
+	"			{\n"
+	"					pairsOut[curPair] = myPair; //flush to main memory\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	int localId = get_local_id(0);\n"
+	"	__local int numActiveWgItems[1];\n"
+	"	__local int breakRequest[1];\n"
+	"	if (localId==0)\n"
+	"	{\n"
+	"		numActiveWgItems[0] = 0;\n"
+	"		breakRequest[0] = 0;\n"
+	"	}\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	atomic_inc(numActiveWgItems);\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	int localBreak = 0;\n"
+	"	int j=i+1;\n"
+	"	do\n"
+	"	{\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	\n"
+	"		if (j<numObjects)\n"
+	"		{\n"
+	"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+	"			{\n"
+	"				if (!localBreak)\n"
+	"				{\n"
+	"					atomic_inc(breakRequest);\n"
+	"					localBreak = 1;\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"		\n"
+	"		if (j>=numObjects && !localBreak)\n"
+	"		{\n"
+	"			atomic_inc(breakRequest);\n"
+	"			localBreak = 1;\n"
+	"		}\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"		\n"
+	"		if (!localBreak)\n"
+	"		{\n"
+	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+	"			{\n"
+	"				int4 myPair;\n"
+	"				myPair.x = aabbs[i].m_minIndices[3];\n"
+	"				myPair.y = aabbs[j].m_minIndices[3];\n"
+	"				myPair.z = NEW_PAIR_MARKER;\n"
+	"				myPair.w = NEW_PAIR_MARKER;\n"
+	"				int curPair = atomic_inc (pairCount);\n"
+	"				if (curPair<maxPairs)\n"
+	"				{\n"
+	"						pairsOut[curPair] = myPair; //flush to main memory\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		j++;\n"
+	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+	"}\n"
+	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	int localId = get_local_id(0);\n"
+	"	__local int numActiveWgItems[1];\n"
+	"	__local int breakRequest[1];\n"
+	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
+	"	\n"
+	"	btAabbCL myAabb;\n"
+	"	\n"
+	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+	"	float testValue = 	myAabb.m_maxElems[axis];\n"
+	"	\n"
+	"	if (localId==0)\n"
+	"	{\n"
+	"		numActiveWgItems[0] = 0;\n"
+	"		breakRequest[0] = 0;\n"
+	"	}\n"
+	"	int localCount=0;\n"
+	"	int block=0;\n"
+	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+	"	\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	atomic_inc(numActiveWgItems);\n"
+	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	int localBreak = 0;\n"
+	"	\n"
+	"	int j=i+1;\n"
+	"	do\n"
+	"	{\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"	\n"
+	"		if (j<numObjects)\n"
+	"		{\n"
+	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+	"			{\n"
+	"				if (!localBreak)\n"
+	"				{\n"
+	"					atomic_inc(breakRequest);\n"
+	"					localBreak = 1;\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"		\n"
+	"		if (j>=numObjects && !localBreak)\n"
+	"		{\n"
+	"			atomic_inc(breakRequest);\n"
+	"			localBreak = 1;\n"
+	"		}\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"		\n"
+	"		if (!localBreak)\n"
+	"		{\n"
+	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+	"			{\n"
+	"				int4 myPair;\n"
+	"				myPair.x = myAabb.m_minIndices[3];\n"
+	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+	"				myPair.z = NEW_PAIR_MARKER;\n"
+	"				myPair.w = NEW_PAIR_MARKER;\n"
+	"				int curPair = atomic_inc (pairCount);\n"
+	"				if (curPair<maxPairs)\n"
+	"				{\n"
+	"						pairsOut[curPair] = myPair; //flush to main memory\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		\n"
+	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+	"		localCount++;\n"
+	"		if (localCount==64)\n"
+	"		{\n"
+	"			localCount = 0;\n"
+	"			block+=64;			\n"
+	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+	"		}\n"
+	"		j++;\n"
+	"		\n"
+	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+	"	\n"
+	"}\n"
+	"//http://stereopsis.com/radix.html\n"
+	"unsigned int FloatFlip(float fl);\n"
+	"unsigned int FloatFlip(float fl)\n"
+	"{\n"
+	"	unsigned int f = *(unsigned int*)&fl;\n"
+	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+	"	return f ^ mask;\n"
+	"}\n"
+	"float IFloatFlip(unsigned int f);\n"
+	"float IFloatFlip(unsigned int f)\n"
+	"{\n"
+	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+	"	unsigned int fl = f ^ mask;\n"
+	"	return *(float*)&fl;\n"
+	"}\n"
+	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numObjects)\n"
+	"		return;\n"
+	"	int src = destAabbs[i].m_maxIndices[3];\n"
+	"	destAabbs[i] = allAabbs[src];\n"
+	"	destAabbs[i].m_maxIndices[3] = src;\n"
+	"}\n"
+	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numObjects)\n"
+	"		return;\n"
+	"	\n"
+	"	\n"
+	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
+	"	sortData[i].y = i;\n"
+	"		\n"
+	"}\n"
+	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numObjects)\n"
+	"		return;\n"
+	"	\n"
+	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
+	"}\n"
+	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
+	"{\n"
+	"	int i = get_global_id(0);\n"
+	"	if (i>=numAabbs)\n"
+	"		return;\n"
+	"	\n"
+	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
+	"	\n"
+	"	float4 s;\n"
+	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
+	"	sum[i]=s;\n"
+	"	sum2[i]=s*s;	\n"
+	"}\n";