summaryrefslogtreecommitdiff
path: root/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision')
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h28
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp253
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h68
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp540
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h68
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp44
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h24
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp918
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h146
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h9
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h395
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h1455
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h681
13 files changed, 2234 insertions, 2395 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
index 0ed8aa8232..b296992525 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
@@ -12,33 +12,31 @@
class b3GpuBroadphaseInterface
{
public:
-
- typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q);
+ typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
- virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
- virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
+ virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
+ virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
- virtual void calculateOverlappingPairs(int maxPairs)=0;
- virtual void calculateOverlappingPairsHost(int maxPairs)=0;
+ virtual void calculateOverlappingPairs(int maxPairs) = 0;
+ virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
- virtual void writeAabbsToGpu()=0;
+ virtual void writeAabbsToGpu() = 0;
+
+ virtual cl_mem getAabbBufferWS() = 0;
+ virtual int getNumOverlap() = 0;
+ virtual cl_mem getOverlappingPairBuffer() = 0;
- virtual cl_mem getAabbBufferWS()=0;
- virtual int getNumOverlap()=0;
- virtual cl_mem getOverlappingPairBuffer()=0;
+ virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
+ virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
- virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
- virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
-
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
-
};
-#endif //B3_GPU_BROADPHASE_INTERFACE_H
+#endif //B3_GPU_BROADPHASE_INTERFACE_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
index 74d0c8056c..e714fadac3 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
@@ -5,12 +5,9 @@
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
-
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
-
-
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
@@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
-
-
-
-
//int maxPairsPerBody = 64;
-int maxBodiesPerCell = 256;//??
+int maxBodiesPerCell = 256; //??
-b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
-:m_context(ctx),
-m_device(device),
-m_queue(q),
-m_allAabbsGPU1(ctx,q),
-m_smallAabbsMappingGPU(ctx,q),
-m_largeAabbsMappingGPU(ctx,q),
-m_gpuPairs(ctx,q),
+b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
+ : m_context(ctx),
+ m_device(device),
+ m_queue(q),
+ m_allAabbsGPU1(ctx, q),
+ m_smallAabbsMappingGPU(ctx, q),
+ m_largeAabbsMappingGPU(ctx, q),
+ m_gpuPairs(ctx, q),
-m_hashGpu(ctx,q),
+ m_hashGpu(ctx, q),
-m_cellStartGpu(ctx,q),
-m_paramsGPU(ctx,q)
+ m_cellStartGpu(ctx, q),
+ m_paramsGPU(ctx, q)
{
-
-
- b3Vector3 gridSize = b3MakeVector3(3,3,3);
- b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
+ b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
+ b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
@@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)
m_paramsCPU.m_invCellSize[3] = 0.f;
m_paramsGPU.push_back(m_paramsCPU);
- cl_int errNum=0;
+ cl_int errNum = 0;
{
const char* sapSrc = sapCL;
- cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
- b3Assert(errNum==CL_SUCCESS);
- m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
- m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
- b3Assert(errNum==CL_SUCCESS);
+ cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
+ b3Assert(errNum == CL_SUCCESS);
+ m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
+ m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
+ b3Assert(errNum == CL_SUCCESS);
}
{
-
- cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
- b3Assert(errNum==CL_SUCCESS);
-
- kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
- b3Assert(errNum==CL_SUCCESS);
-
- kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
- b3Assert(errNum==CL_SUCCESS);
-
- kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
- b3Assert(errNum==CL_SUCCESS);
-
-
- kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
- b3Assert(errNum==CL_SUCCESS);
-
-
-
-
- }
+ cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
+ b3Assert(errNum == CL_SUCCESS);
- m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
+ kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
+ b3Assert(errNum == CL_SUCCESS);
+ kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
+ b3Assert(errNum == CL_SUCCESS);
+
+ kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
+ b3Assert(errNum == CL_SUCCESS);
+
+ kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
+ b3Assert(errNum == CL_SUCCESS);
+ }
+
+ m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
- clReleaseKernel( kCalcHashAABB);
- clReleaseKernel( kClearCellStart);
- clReleaseKernel( kFindCellStart);
- clReleaseKernel( kFindOverlappingPairs);
- clReleaseKernel( m_sap2Kernel);
- clReleaseKernel( m_copyAabbsKernel);
-
-
-
+ clReleaseKernel(kCalcHashAABB);
+ clReleaseKernel(kClearCellStart);
+ clReleaseKernel(kFindCellStart);
+ clReleaseKernel(kFindOverlappingPairs);
+ clReleaseKernel(m_sap2Kernel);
+ clReleaseKernel(m_copyAabbsKernel);
+
delete m_sorter;
}
-
-
-void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
+void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
- aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
+ aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
-
}
-void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
+void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
- aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
+ aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
-void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
+void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
-
if (0)
{
calculateOverlappingPairsHost(maxPairs);
- /*
+ /*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
@@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
*/
return;
}
-
-
-
-
int numSmallAabbs = m_smallAabbsMappingGPU.size();
- b3OpenCLArray<int> pairCount(m_context,m_queue);
+ b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
- m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
+ m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody);
{
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
- b3BufferInfoCL bInfo[] = {
- b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
- b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
- b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
- b3BufferInfoCL( m_gpuPairs.getBufferCL() ),
+ b3BufferInfoCL bInfo[] = {
+ b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
+ b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
+ b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
+ b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
- b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( numLargeAabbs );
- launcher.setConst( numSmallAabbs);
- launcher.setConst( 0 );//axis is not used
- launcher.setConst( maxPairs );
- //@todo: use actual maximum work item sizes of the device instead of hardcoded values
- launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
-
+ b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(numLargeAabbs);
+ launcher.setConst(numSmallAabbs);
+ launcher.setConst(0); //axis is not used
+ launcher.setConst(maxPairs);
+ //@todo: use actual maximum work item sizes of the device instead of hardcoded values
+ launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
+
int numPairs = pairCount.at(0);
-
- if (numPairs >maxPairs)
+
+ if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
- numPairs =maxPairs;
+ numPairs = maxPairs;
}
}
}
-
-
-
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
B3_PROFILE("kCalcHashAABB");
- b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
+ b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
@@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
}
m_sorter->execute(m_hashGpu);
-
- int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
+
+ int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
-
-
+
{
B3_PROFILE("kClearCellStart");
- b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
+ b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
-
}
-
{
B3_PROFILE("kFindCellStart");
- b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
+ b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
-
}
-
+
{
B3_PROFILE("kFindOverlappingPairs");
-
-
- b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
+
+ b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
-
+
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
-
+
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
-
int numPairs = pairCount.at(0);
- if (numPairs >maxPairs)
+ if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
- numPairs =maxPairs;
+ numPairs = maxPairs;
}
-
+
m_gpuPairs.resize(numPairs);
-
+
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
- printf("m_gpuPairs.size()=%d\n",sz);
- for (int i=0;i<m_gpuPairs.size();i++)
+ printf("m_gpuPairs.size()=%d\n", sz);
+ for (int i = 0; i < m_gpuPairs.size(); i++)
{
- printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
+ printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
-
}
-
-
}
-
-
-
-
//calculateOverlappingPairsHost(maxPairs);
}
-void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
+void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
-
m_hostPairs.resize(0);
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
- for (int i=0;i<m_allAabbsCPU1.size();i++)
+ for (int i = 0; i < m_allAabbsCPU1.size(); i++)
{
- for (int j=i+1;j<m_allAabbsCPU1.size();j++)
+ for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)
{
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
- m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
+ m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
{
b3Int4 pair;
int a = m_allAabbsCPU1[j].m_minIndices[3];
int b = m_allAabbsCPU1[i].m_minIndices[3];
- if (a<=b)
+ if (a <= b)
{
- pair.x = a;
- pair.y = b;//store the original index in the unsorted aabb array
- } else
+ pair.x = a;
+ pair.y = b; //store the original index in the unsorted aabb array
+ }
+ else
{
pair.x = b;
- pair.y = a;//store the original index in the unsorted aabb array
+ pair.y = a; //store the original index in the unsorted aabb array
}
-
- if (m_hostPairs.size()<maxPairs)
+
+ if (m_hostPairs.size() < maxPairs)
{
m_hostPairs.push_back(pair);
}
@@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
-
m_gpuPairs.copyFromHost(m_hostPairs);
-
-
}
- //call writeAabbsToGpu after done making all changes (createProxy etc)
+//call writeAabbsToGpu after done making all changes (createProxy etc)
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
-
}
-cl_mem b3GpuGridBroadphase::getAabbBufferWS()
+cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
-int b3GpuGridBroadphase::getNumOverlap()
+int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
-cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
+cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
-b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
+b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
-b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
+b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
@@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}
-
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
index ec18c9f716..b76cb43b68 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
@@ -6,83 +6,75 @@
struct b3ParamsGridBroadphaseCL
{
-
float m_invCellSize[4];
- int m_gridSize[4];
+ int m_gridSize[4];
- int getMaxBodiesPerCell() const
+ int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
- void setMaxBodiesPerCell(int maxOverlap)
+ void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
-
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
- cl_context m_context;
- cl_device_id m_device;
- cl_command_queue m_queue;
+ cl_context m_context;
+ cl_device_id m_device;
+ cl_command_queue m_queue;
- b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
- b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
+ b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
+ b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
- b3OpenCLArray<int> m_smallAabbsMappingGPU;
+ b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
- b3OpenCLArray<int> m_largeAabbsMappingGPU;
+ b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3AlignedObjectArray<b3Int4> m_hostPairs;
- b3OpenCLArray<b3Int4> m_gpuPairs;
+ b3OpenCLArray<b3Int4> m_gpuPairs;
- b3OpenCLArray<b3SortData> m_hashGpu;
- b3OpenCLArray<int> m_cellStartGpu;
-
+ b3OpenCLArray<b3SortData> m_hashGpu;
+ b3OpenCLArray<int> m_cellStartGpu;
- b3ParamsGridBroadphaseCL m_paramsCPU;
- b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
+ b3ParamsGridBroadphaseCL m_paramsCPU;
+ b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
- class b3RadixSort32CL* m_sorter;
+ class b3RadixSort32CL* m_sorter;
public:
-
- b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
+ b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
- static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
+ static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuGridBroadphase(ctx,device,q);
+ return new b3GpuGridBroadphase(ctx, device, q);
}
-
-
+ virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+ virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
- virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
- virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
-
- virtual void calculateOverlappingPairs(int maxPairs);
- virtual void calculateOverlappingPairsHost(int maxPairs);
+ virtual void calculateOverlappingPairs(int maxPairs);
+ virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
- virtual cl_mem getAabbBufferWS();
- virtual int getNumOverlap();
- virtual cl_mem getOverlappingPairBuffer();
+ virtual cl_mem getAabbBufferWS();
+ virtual int getNumOverlap();
+ virtual cl_mem getOverlappingPairBuffer();
+
+ virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
+ virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
- virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
- virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
-
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
-
};
-#endif //B3_GPU_GRID_BROADPHASE_H \ No newline at end of file
+#endif //B3_GPU_GRID_BROADPHASE_H \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
index 641df9eb12..0721928684 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
@@ -16,177 +16,174 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvh.h"
-b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
- m_queue(queue),
- m_radixSorter(context, device, queue),
-
- m_rootNodeIndex(context, queue),
- m_maxDistanceFromRoot(context, queue),
- m_temp(context, queue),
-
- m_internalNodeAabbs(context, queue),
- m_internalNodeLeafIndexRanges(context, queue),
- m_internalNodeChildNodes(context, queue),
- m_internalNodeParentNodes(context, queue),
-
- m_commonPrefixes(context, queue),
- m_commonPrefixLengths(context, queue),
- m_distanceFromRoot(context, queue),
-
- m_leafNodeParentNodes(context, queue),
- m_mortonCodesAndAabbIndicies(context, queue),
- m_mergedAabb(context, queue),
- m_leafNodeAabbs(context, queue),
-
- m_largeAabbs(context, queue)
+b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
+ m_radixSorter(context, device, queue),
+
+ m_rootNodeIndex(context, queue),
+ m_maxDistanceFromRoot(context, queue),
+ m_temp(context, queue),
+
+ m_internalNodeAabbs(context, queue),
+ m_internalNodeLeafIndexRanges(context, queue),
+ m_internalNodeChildNodes(context, queue),
+ m_internalNodeParentNodes(context, queue),
+
+ m_commonPrefixes(context, queue),
+ m_commonPrefixLengths(context, queue),
+ m_distanceFromRoot(context, queue),
+
+ m_leafNodeParentNodes(context, queue),
+ m_mortonCodesAndAabbIndicies(context, queue),
+ m_mergedAabb(context, queue),
+ m_leafNodeAabbs(context, queue),
+
+ m_largeAabbs(context, queue)
{
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
-
+
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
-
- const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
+
+ const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
-
- m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
+
+ m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
- m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
- m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
-
- m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
+
+ m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
- m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
- m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
- m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
- m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
-
- m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
+
+ m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
-
- m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
+
+ m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
- m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
- m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
- m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
+ m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
-b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
+b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
-
+
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
-
+
clReleaseKernel(m_findLeafIndexRangesKernel);
-
+
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
-
+
clReleaseProgram(m_parallelLinearBvhProgram);
}
-void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
- const b3OpenCLArray<int>& largeAabbIndices)
+void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
+ const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
-
+
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
-
- //Since all AABBs(both large and small) are input as a contiguous array,
+
+ //Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
-
+
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
-
+
//Write large AABBs into m_largeAabbs
{
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
- b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
-
- b3BufferInfoCL( m_largeAabbs.getBufferCL() )
- };
-
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
+ b3BufferInfoCL(largeAabbIndices.getBufferCL()),
+
+ b3BufferInfoCL(m_largeAabbs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
-
+
launcher.launch1D(numLargeAabbs);
}
-
+
//Write small AABBs into m_leafNodeAabbs
{
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
- b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
-
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
- };
-
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
+ b3BufferInfoCL(smallAabbIndices.getBufferCL()),
+
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
-
+
launcher.launch1D(numSmallAabbs);
}
-
+
clFinish(m_queue);
}
-
+
//
- int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
+ int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
-
- if(numLeaves < 2)
+
+ if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
-
+
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
- if(numLeaves == 1)
+ if (numLeaves == 1)
{
b3SortData leaf;
- leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
-
+ leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
+
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
-
+
return;
}
-
+
//
{
m_internalNodeAabbs.resize(numInternalNodes);
@@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
-
+
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
-
- //Find the merged AABB of all small AABBs; this is used to define the size of
+
+ //Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
-
- m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
-
- for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
- numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
+
+ m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
+
+ for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
+ numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
- };
-
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
+ };
+
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
-
+
launcher.launch1D(numAabbsNeedingMerge);
}
-
+
clFinish(m_queue);
}
-
+
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
@@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
// m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
- b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
- b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+ b3BufferInfoCL(m_mergedAabb.getBufferCL()),
+ b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
-
+
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
-
+
//
{
B3_PROFILE("Sort leaves by morton codes");
-
+
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
-
+
//
constructBinaryRadixTree();
-
-
+
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
@@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
-
+
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
@@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou
{
int maxPairs = out_overlappingPairs.size();
b3OpenCLArray<int>& numPairsGpu = m_temp;
-
+
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
-
+
//
- if( m_leafNodeAabbs.size() > 1 )
+ if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
-
+
int numQueryAabbs = m_leafNodeAabbs.size();
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
-
- b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
- b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
-
- b3BufferInfoCL( numPairsGpu.getBufferCL() ),
- b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+
+ b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
+ b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+
+ b3BufferInfoCL(numPairsGpu.getBufferCL()),
+ b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
-
+
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
-
+
int numLargeAabbRigids = m_largeAabbs.size();
- if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
+ if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
-
+
int numQueryAabbs = m_leafNodeAabbs.size();
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
- b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
-
- b3BufferInfoCL( numPairsGpu.getBufferCL() ),
- b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+ b3BufferInfoCL(m_largeAabbs.getBufferCL()),
+
+ b3BufferInfoCL(numPairsGpu.getBufferCL()),
+ b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
-
+
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
-
-
+
//
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
- if(numPairs > maxPairs)
+ if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
-
+
out_overlappingPairs.resize(numPairs);
}
-
-void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
- b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
+void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
+ b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
-
+
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
-
+
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
-
+
//
- if( m_leafNodeAabbs.size() > 0 )
+ if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
-
- b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
- b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
-
- b3BufferInfoCL( rays.getBufferCL() ),
-
- b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
- b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+
+ b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
+ b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+
+ b3BufferInfoCL(rays.getBufferCL()),
+
+ b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
+ b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
-
+
launcher.launch1D(numRays);
clFinish(m_queue);
}
-
+
int numLargeAabbRigids = m_largeAabbs.size();
- if(numLargeAabbRigids > 0)
+ if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
- b3BufferInfoCL( rays.getBufferCL() ),
-
- b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
- b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_largeAabbs.getBufferCL()),
+ b3BufferInfoCL(rays.getBufferCL()),
+
+ b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
+ b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
-
+
launcher.launch1D(numRays);
clFinish(m_queue);
}
-
+
//
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
-
- if(numRayRigidPairs > maxRayRigidPairs)
+
+ if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
-
}
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
-
+
int numLeaves = m_leafNodeAabbs.size();
int numInternalNodes = numLeaves - 1;
-
+
//Each internal node is placed in between 2 leaf nodes.
//By using this arrangement and computing the common prefix between
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
{
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
- b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
- b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+ b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
+ b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
-
+
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
-
- //For each leaf node, select its parent node by
+
+ //For each leaf node, select its parent node by
//comparing the 2 nearest internal nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
- b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
+ b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
-
+
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
-
+
//For each internal node, perform 2 binary searches among the other internal nodes
//to its left and right to find its potential parent nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
- b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
- b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
+ b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
+ b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
-
+
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
-
+
//Find the number of nodes seperating each internal node and the root node
//so that the AABBs can be set using the next kernel.
//Also determine the maximum number of nodes separating an internal node and the root node.
{
B3_PROFILE("m_findDistanceFromRootKernel");
-
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
- b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
- b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
- };
-
+
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
+ b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
+ b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
-
+
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
-
+
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
//the tree towards the root to set the AABBs of each internal node; each internal node
//checks its children and merges their AABBs
{
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
-
+
int maxDistanceFromRoot = -1;
{
B3_PROFILE("copy maxDistanceFromRoot to CPU");
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
clFinish(m_queue);
}
-
- for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
+
+ for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
{
- b3BufferInfoCL bufferInfo[] =
- {
- b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
- b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
- b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
- b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
- };
-
+ b3BufferInfoCL bufferInfo[] =
+ {
+ b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
+ b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+ b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+ b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
+
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
- launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+ launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxDistanceFromRoot);
launcher.setConst(distanceFromRoot);
launcher.setConst(numInternalNodes);
-
+
//It may seem inefficent to launch a thread for each internal node when a
//much smaller number of nodes is actually processed, but this is actually
- //faster than determining the exact nodes that are ready to merge their child AABBs.
+ //faster than determining the exact nodes that are ready to merge their child AABBs.
launcher.launch1D(numInternalNodes);
}
-
+
clFinish(m_queue);
}
}
-
- \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
index effe617b7b..b390775129 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@@ -37,10 +37,10 @@ subject to the following restrictions:
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
-/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
+/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
-/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
-/// - [somewhat parallel] Set internal node AABBs
+/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
+/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
@@ -49,75 +49,75 @@ subject to the following restrictions:
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
-
+
cl_program m_parallelLinearBvhProgram;
-
+
cl_kernel m_separateAabbsKernel;
cl_kernel m_findAllNodesMergedAabbKernel;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
-
+
//Binary radix tree construction kernels
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
cl_kernel m_findDistanceFromRootKernel;
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
-
+
cl_kernel m_findLeafIndexRangesKernel;
-
+
//Traversal kernels
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
cl_kernel m_plbvhRayTraverseKernel;
cl_kernel m_plbvhLargeAabbAabbTestKernel;
cl_kernel m_plbvhLargeAabbRayTestKernel;
-
+
b3RadixSort32CL m_radixSorter;
-
+
//1 element
- b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
- b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
- b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
-
+ b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
+ b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
+ b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
+
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
- b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
- b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
- b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
-
+ b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
+ b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
+ b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
+
//1 element per internal node; for binary radix tree construction
b3OpenCLArray<b3Int64> m_commonPrefixes;
b3OpenCLArray<int> m_commonPrefixLengths;
- b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
-
+ b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
+
//1 element per leaf node (leaf nodes only include small AABBs)
- b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
- b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
- b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
- b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
-
+ b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
+ b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
+ b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
+ b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
+
//1 element per large AABB, which is not stored in the BVH
b3OpenCLArray<b3SapAabb> m_largeAabbs;
-
+
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvh();
-
+
///Must be called before any other function
- void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
- const b3OpenCLArray<int>& largeAabbIndices);
-
+ void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
+ const b3OpenCLArray<int>& largeAabbIndices);
+
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
-
+
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
- void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
- b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
-
+ void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
+ b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
+
private:
void constructBinaryRadixTree();
};
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
index d2618024ac..62ea7a32df 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
@@ -13,45 +13,44 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvhBroadphase.h"
-b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
- m_plbvh(context, device, queue),
-
- m_overlappingPairsGpu(context, queue),
-
- m_aabbsGpu(context, queue),
- m_smallAabbsMappingGpu(context, queue),
- m_largeAabbsMappingGpu(context, queue)
+b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
+
+ m_overlappingPairsGpu(context, queue),
+
+ m_aabbsGpu(context, queue),
+ m_smallAabbsMappingGpu(context, queue),
+ m_largeAabbsMappingGpu(context, queue)
{
}
-void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
+void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
-
+
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
-
+
m_smallAabbsMappingCpu.push_back(newAabbIndex);
-
+
m_aabbsCpu.push_back(aabb);
}
-void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
+void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
-
+
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
-
+
m_largeAabbsMappingCpu.push_back(newAabbIndex);
-
+
m_aabbsCpu.push_back(aabb);
}
@@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
//Reconstruct BVH
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
-
+
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
- b3Assert(0); //CPU version not implemented
+ b3Assert(0); //CPU version not implemented
}
-void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
-{
- m_aabbsGpu.copyFromHost(m_aabbsCpu);
+void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
+{
+ m_aabbsGpu.copyFromHost(m_aabbsCpu);
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}
-
-
-
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
index e518500637..dda0eea7be 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
@@ -21,42 +21,42 @@ subject to the following restrictions:
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
-
+
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
-
+
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_smallAabbsMappingGpu;
b3OpenCLArray<int> m_largeAabbsMappingGpu;
-
+
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
-
+
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvhBroadphase() {}
- virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
- virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
-
+ virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+ virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
-
- virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
+
+ virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
-
+
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
-
+
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
-
+
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
index c45fbbdcaa..4126d03ed0 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@@ -6,7 +6,6 @@ bool searchIncremental3dSapOnGpu = true;
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h"
-
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
@@ -56,110 +55,105 @@ bool searchIncremental3dSapOnGpu = true;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
*/
-b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType)
-:m_context(ctx),
-m_device(device),
-m_queue(q),
-
-m_objectMinMaxIndexGPUaxis0(ctx,q),
-m_objectMinMaxIndexGPUaxis1(ctx,q),
-m_objectMinMaxIndexGPUaxis2(ctx,q),
-m_objectMinMaxIndexGPUaxis0prev(ctx,q),
-m_objectMinMaxIndexGPUaxis1prev(ctx,q),
-m_objectMinMaxIndexGPUaxis2prev(ctx,q),
-m_sortedAxisGPU0(ctx,q),
-m_sortedAxisGPU1(ctx,q),
-m_sortedAxisGPU2(ctx,q),
-m_sortedAxisGPU0prev(ctx,q),
-m_sortedAxisGPU1prev(ctx,q),
-m_sortedAxisGPU2prev(ctx,q),
-m_addedHostPairsGPU(ctx,q),
-m_removedHostPairsGPU(ctx,q),
-m_addedCountGPU(ctx,q),
-m_removedCountGPU(ctx,q),
-m_currentBuffer(-1),
-m_pairCount(ctx,q),
-m_allAabbsGPU(ctx,q),
-m_sum(ctx,q),
-m_sum2(ctx,q),
-m_dst(ctx,q),
-m_smallAabbsMappingGPU(ctx,q),
-m_largeAabbsMappingGPU(ctx,q),
-m_overlappingPairs(ctx,q),
-m_gpuSmallSortData(ctx,q),
-m_gpuSmallSortedAabbs(ctx,q)
+b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType)
+ : m_context(ctx),
+ m_device(device),
+ m_queue(q),
+
+ m_objectMinMaxIndexGPUaxis0(ctx, q),
+ m_objectMinMaxIndexGPUaxis1(ctx, q),
+ m_objectMinMaxIndexGPUaxis2(ctx, q),
+ m_objectMinMaxIndexGPUaxis0prev(ctx, q),
+ m_objectMinMaxIndexGPUaxis1prev(ctx, q),
+ m_objectMinMaxIndexGPUaxis2prev(ctx, q),
+ m_sortedAxisGPU0(ctx, q),
+ m_sortedAxisGPU1(ctx, q),
+ m_sortedAxisGPU2(ctx, q),
+ m_sortedAxisGPU0prev(ctx, q),
+ m_sortedAxisGPU1prev(ctx, q),
+ m_sortedAxisGPU2prev(ctx, q),
+ m_addedHostPairsGPU(ctx, q),
+ m_removedHostPairsGPU(ctx, q),
+ m_addedCountGPU(ctx, q),
+ m_removedCountGPU(ctx, q),
+ m_currentBuffer(-1),
+ m_pairCount(ctx, q),
+ m_allAabbsGPU(ctx, q),
+ m_sum(ctx, q),
+ m_sum2(ctx, q),
+ m_dst(ctx, q),
+ m_smallAabbsMappingGPU(ctx, q),
+ m_largeAabbsMappingGPU(ctx, q),
+ m_overlappingPairs(ctx, q),
+ m_gpuSmallSortData(ctx, q),
+ m_gpuSmallSortedAabbs(ctx, q)
{
const char* sapSrc = sapCL;
-
-
- cl_int errNum=0;
+
+ cl_int errNum = 0;
b3Assert(m_context);
b3Assert(m_device);
- cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
- b3Assert(errNum==CL_SUCCESS);
-
+ cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
+ b3Assert(errNum == CL_SUCCESS);
- b3Assert(errNum==CL_SUCCESS);
+ b3Assert(errNum == CL_SUCCESS);
#ifndef __APPLE__
- m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);
+ m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue);
#else
m_prefixScanFloat4 = 0;
#endif
m_sapKernel = 0;
-
+
switch (kernelType)
{
case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU:
{
- m_sapKernel=0;
+ m_sapKernel = 0;
break;
}
- case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:
+ case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:
{
- m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg );
+ m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg);
break;
}
case B3_GPU_SAP_KERNEL_ORIGINAL:
{
- m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+ m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg);
break;
}
case B3_GPU_SAP_KERNEL_BARRIER:
{
- m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
+ m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg);
break;
}
case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY:
{
- m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+ m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
break;
}
default:
{
- m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+ m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");
}
};
-
-
-
- m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
- b3Assert(errNum==CL_SUCCESS);
- m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
- b3Assert(errNum==CL_SUCCESS);
+ m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
+ b3Assert(errNum == CL_SUCCESS);
-
- m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
+ m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg);
+ b3Assert(errNum == CL_SUCCESS);
- m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
+ m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg);
- m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
+ m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
- m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
+ m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg);
+
+ m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
b3GpuSapBroadphase::~b3GpuSapBroadphase()
@@ -173,13 +167,11 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase()
clReleaseKernel(m_sapKernel);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_prepareSumVarianceKernel);
-
-
}
/// conservative test for overlap between two aabbs
-static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
- const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
+static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1,
+ const b3Vector3& aabbMin2, const b3Vector3& aabbMax2)
{
bool overlap = true;
overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
@@ -188,8 +180,6 @@ static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aab
return overlap;
}
-
-
//http://stereopsis.com/radix.html
static unsigned int FloatFlip(float fl)
{
@@ -198,79 +188,77 @@ static unsigned int FloatFlip(float fl)
return f ^ mask;
};
-void b3GpuSapBroadphase::init3dSap()
+void b3GpuSapBroadphase::init3dSap()
{
- if (m_currentBuffer<0)
+ if (m_currentBuffer < 0)
{
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
m_currentBuffer = 0;
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
- for (int buf=0;buf<2;buf++)
+ for (int buf = 0; buf < 2; buf++)
{
int totalNumAabbs = m_allAabbsCPU.size();
- int numEndPoints = 2*totalNumAabbs;
+ int numEndPoints = 2 * totalNumAabbs;
m_sortedAxisCPU[axis][buf].resize(numEndPoints);
- if (buf==m_currentBuffer)
+ if (buf == m_currentBuffer)
{
- for (int i=0;i<totalNumAabbs;i++)
+ for (int i = 0; i < totalNumAabbs; i++)
{
- m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1;
- m_sortedAxisCPU[axis][buf][i*2].m_value = i*2;
- m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1;
- m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1;
+ m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1;
+ m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2;
+ m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1;
+ m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1;
}
}
}
}
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
}
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
//int totalNumAabbs = m_allAabbsCPU.size();
int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints);
- for (int i=0;i<numEndPoints;i++)
+ for (int i = 0; i < numEndPoints; i++)
{
int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
- int newDest = destIndex/2;
- if (destIndex&1)
+ int newDest = destIndex / 2;
+ if (destIndex & 1)
{
- m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;
- } else
+ m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
+ }
+ else
{
- m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;
+ m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
}
}
}
-
}
}
-
static bool b3PairCmp(const b3Int4& p, const b3Int4& q)
{
- return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y)));
+ return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y)));
}
-
-static bool operator==(const b3Int4& a,const b3Int4& b)
+static bool operator==(const b3Int4& a, const b3Int4& b)
{
return a.x == b.x && a.y == b.y;
};
-static bool operator<(const b3Int4& a,const b3Int4& b)
+static bool operator<(const b3Int4& a, const b3Int4& b)
{
return a.x < b.x || (a.x == b.x && a.y < b.y);
};
-static bool operator>(const b3Int4& a,const b3Int4& b)
+static bool operator>(const b3Int4& a, const b3Int4& b)
{
return a.x > b.x || (a.x == b.x && a.y > b.y);
};
@@ -278,31 +266,29 @@ static bool operator>(const b3Int4& a,const b3Int4& b)
b3AlignedObjectArray<b3Int4> addedHostPairs;
b3AlignedObjectArray<b3Int4> removedHostPairs;
-b3AlignedObjectArray<b3SapAabb> preAabbs;
+b3AlignedObjectArray<b3SapAabb> preAabbs;
-void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
+void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
{
//static int framepje = 0;
//printf("framepje=%d\n",framepje++);
-
B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");
addedHostPairs.resize(0);
removedHostPairs.resize(0);
- b3Assert(m_currentBuffer>=0);
-
+ b3Assert(m_currentBuffer >= 0);
+
{
preAabbs.resize(m_allAabbsCPU.size());
- for (int i=0;i<preAabbs.size();i++)
+ for (int i = 0; i < preAabbs.size(); i++)
{
- preAabbs[i]=m_allAabbsCPU[i];
+ preAabbs[i] = m_allAabbsCPU[i];
}
}
-
- if (m_currentBuffer<0)
+ if (m_currentBuffer < 0)
return;
{
B3_PROFILE("m_allAabbsGPU.copyToHost");
@@ -316,100 +302,87 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
}
if (0)
{
- {
- printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
- m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2],
- m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]);
- }
-
- {
- printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
- m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2],
- m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]);
- }
-
-
- {
- b3Int4 newPair;
- newPair.x = 40;
- newPair.y = 53;
- int index = allPairs.findBinarySearch(newPair);
- printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size());
-
{
- int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max);
- printf("overlap=%d\n",overlap);
+ printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
+ m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2],
+ m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]);
}
- if (preAabbs.size())
- {
- int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max);
- printf("prevoverlap=%d\n",prevOverlap);
- } else
{
- printf("unknown prevoverlap\n");
+ printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
+ m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2],
+ m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]);
}
- }
- }
+ {
+ b3Int4 newPair;
+ newPair.x = 40;
+ newPair.y = 53;
+ int index = allPairs.findBinarySearch(newPair);
+ printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size());
+ {
+ int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max);
+ printf("overlap=%d\n", overlap);
+ }
+
+ if (preAabbs.size())
+ {
+ int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max);
+ printf("prevoverlap=%d\n", prevOverlap);
+ }
+ else
+ {
+ printf("unknown prevoverlap\n");
+ }
+ }
+ }
if (0)
{
- for (int i=0;i<m_allAabbsCPU.size();i++)
+ for (int i = 0; i < m_allAabbsCPU.size(); i++)
{
//printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]);
-
-
}
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
- for (int buf=0;buf<2;buf++)
+ for (int buf = 0; buf < 2; buf++)
{
- b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2);
+ b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2);
}
}
}
-
-
- m_currentBuffer = 1-m_currentBuffer;
-
-
+ m_currentBuffer = 1 - m_currentBuffer;
int totalNumAabbs = m_allAabbsCPU.size();
{
B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)");
- for (int i=0;i<totalNumAabbs;i++)
+ for (int i = 0; i < totalNumAabbs; i++)
{
-
-
unsigned int keyMin[3];
unsigned int keyMax[3];
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
- float vmin=m_allAabbsCPU[i].m_min[axis];
+ float vmin = m_allAabbsCPU[i].m_min[axis];
float vmax = m_allAabbsCPU[i].m_max[axis];
keyMin[axis] = FloatFlip(vmin);
keyMax[axis] = FloatFlip(vmax);
-
- m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1;
- m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2;
- m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1;
- m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1;
+
+ m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1;
+ m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2;
+ m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1;
+ m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1;
}
//printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]);
-
}
}
-
-
{
B3_PROFILE("sort m_sortedAxisCPU");
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
}
@@ -432,21 +405,22 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
{
B3_PROFILE("assign m_objectMinMaxIndexCPU");
- for (int axis=0;axis<3;axis++)
+ for (int axis = 0; axis < 3; axis++)
{
int totalNumAabbs = m_allAabbsCPU.size();
int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs);
- for (int i=0;i<numEndPoints;i++)
+ for (int i = 0; i < numEndPoints; i++)
{
int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
- int newDest = destIndex/2;
- if (destIndex&1)
+ int newDest = destIndex / 2;
+ if (destIndex & 1)
{
- m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;
- } else
+ m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
+ }
+ else
{
- m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;
+ m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
}
}
}
@@ -485,12 +459,11 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
}
#endif
-
int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();
int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
- b3Assert(a==b);
- b3Assert(b==c);
+ b3Assert(a == b);
+ b3Assert(b == c);
/*
if (searchIncremental3dSapOnGpu)
{
@@ -574,175 +547,170 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
B3_PROFILE("actual search");
- for (int i=0;i<numObjects;i++)
+ for (int i = 0; i < numObjects; i++)
{
//int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();
//int checkObjects[]={40,53};
//int numCheckObjects = sizeof(checkObjects)/sizeof(int);
-
+
//for (int a=0;a<numCheckObjects ;a++)
-
- for (int axis=0;axis<3;axis++)
+
+ for (int axis = 0; axis < 3; axis++)
{
//int i = checkObjects[a];
unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;
unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y;
- unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x;
+ unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x;
int dmin = curMinIndex - prevMinIndex;
-
- unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y;
-
+ unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y;
int dmax = curMaxIndex - prevMaxIndex;
- if (dmin!=0)
+ if (dmin != 0)
{
//printf("for object %d, dmin=%d\n",i,dmin);
}
- if (dmax!=0)
+ if (dmax != 0)
{
//printf("for object %d, dmax=%d\n",i,dmax);
}
- for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)
+ for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++)
{
- if (dmin!=0)
+ if (dmin != 0)
{
- int stepMin = dmin<0 ? -1 : 1;
- for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)
+ int stepMin = dmin < 0 ? -1 : 1;
+ for (int j = prevMinIndex; j != curMinIndex; j += stepMin)
{
int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
- int otherIndex = otherIndex2/2;
- if (otherIndex!=i)
+ int otherIndex = otherIndex2 / 2;
+ if (otherIndex != i)
{
- bool otherIsMax = ((otherIndex2&1)!=0);
+ bool otherIsMax = ((otherIndex2 & 1) != 0);
if (otherIsMax)
{
//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
-
+
bool overlap = true;
- for (int ax=0;ax<3;ax++)
+ for (int ax = 0; ax < 3; ax++)
{
if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
- overlap=false;
+ overlap = false;
}
- // b3Assert(overlap2==overlap);
+ // b3Assert(overlap2==overlap);
bool prevOverlap = true;
- for (int ax=0;ax<3;ax++)
+ for (int ax = 0; ax < 3; ax++)
{
- if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
- (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
- prevOverlap=false;
+ if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
+ (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
+ prevOverlap = false;
}
-
//b3Assert(overlap==overlap2);
-
-
- if (dmin<0)
+ if (dmin < 0)
{
if (overlap && !prevOverlap)
{
//add a pair
b3Int4 newPair;
- if (i<=otherIndex)
+ if (i <= otherIndex)
{
newPair.x = i;
newPair.y = otherIndex;
- } else
+ }
+ else
{
newPair.x = otherIndex;
newPair.y = i;
}
addedHostPairs.push_back(newPair);
}
- }
+ }
else
{
if (!overlap && prevOverlap)
{
-
//remove a pair
b3Int4 removedPair;
- if (i<=otherIndex)
+ if (i <= otherIndex)
{
removedPair.x = i;
removedPair.y = otherIndex;
- } else
+ }
+ else
{
removedPair.x = otherIndex;
removedPair.y = i;
}
removedHostPairs.push_back(removedPair);
}
- }//otherisMax
- }//if (dmin<0)
- }//if (otherIndex!=i)
- }//for (int j=
+ } //otherisMax
+ } //if (dmin<0)
+ } //if (otherIndex!=i)
+ } //for (int j=
}
-
- if (dmax!=0)
+
+ if (dmax != 0)
{
- int stepMax = dmax<0 ? -1 : 1;
- for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)
+ int stepMax = dmax < 0 ? -1 : 1;
+ for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax)
{
int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
- int otherIndex = otherIndex2/2;
- if (otherIndex!=i)
+ int otherIndex = otherIndex2 / 2;
+ if (otherIndex != i)
{
//bool otherIsMin = ((otherIndex2&1)==0);
//if (otherIsMin)
{
//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
-
+
bool overlap = true;
- for (int ax=0;ax<3;ax++)
+ for (int ax = 0; ax < 3; ax++)
{
if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
- overlap=false;
+ overlap = false;
}
//b3Assert(overlap2==overlap);
bool prevOverlap = true;
- for (int ax=0;ax<3;ax++)
+ for (int ax = 0; ax < 3; ax++)
{
- if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
- (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
- prevOverlap=false;
+ if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
+ (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
+ prevOverlap = false;
}
-
- if (dmax>0)
+ if (dmax > 0)
{
if (overlap && !prevOverlap)
{
//add a pair
b3Int4 newPair;
- if (i<=otherIndex)
+ if (i <= otherIndex)
{
newPair.x = i;
newPair.y = otherIndex;
- } else
+ }
+ else
{
newPair.x = otherIndex;
newPair.y = i;
}
addedHostPairs.push_back(newPair);
-
}
- }
+ }
else
{
if (!overlap && prevOverlap)
@@ -750,33 +718,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
//if (otherIndex2&1==0) -> min?
//remove a pair
b3Int4 removedPair;
- if (i<=otherIndex)
+ if (i <= otherIndex)
{
removedPair.x = i;
removedPair.y = otherIndex;
- } else
+ }
+ else
{
removedPair.x = otherIndex;
removedPair.y = i;
}
removedHostPairs.push_back(removedPair);
-
}
}
-
- }//if (dmin<0)
- }//if (otherIndex!=i)
- }//for (int j=
+
+ } //if (dmin<0)
+ } //if (otherIndex!=i)
+ } //for (int j=
}
- }//for (int otherbuffer
- }//for (int axis=0;
- }//for (int i=0;i<numObjects
+ } //for (int otherbuffer
+ } //for (int axis=0;
+ } //for (int i=0;i<numObjects
}
//remove duplicates and add/remove then to existing m_overlappingPairs
-
-
-
+
{
{
B3_PROFILE("sort allPairs");
@@ -795,31 +761,28 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
b3Int4 prevPair;
prevPair.x = -1;
prevPair.y = -1;
-
+
int uniqueRemovedPairs = 0;
b3AlignedObjectArray<int> removedPositions;
{
B3_PROFILE("actual removing");
- for (int i=0;i<removedHostPairs.size();i++)
+ for (int i = 0; i < removedHostPairs.size(); i++)
{
b3Int4 removedPair = removedHostPairs[i];
if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))
{
+ int index1 = allPairs.findBinarySearch(removedPair);
- int index1 = allPairs.findBinarySearch(removedPair);
+ //#ifdef _DEBUG
- //#ifdef _DEBUG
-
-
-
int index2 = allPairs.findLinearSearch(removedPair);
- b3Assert(index1==index2);
-
+ b3Assert(index1 == index2);
+
//b3Assert(index1!=allPairs.size());
- if (index1<allPairs.size())
- //#endif//_DEBUG
+ if (index1 < allPairs.size())
+ //#endif//_DEBUG
{
uniqueRemovedPairs++;
removedPositions.push_back(index1);
@@ -833,13 +796,13 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
if (uniqueRemovedPairs)
{
- for (int i=0;i<removedPositions.size();i++)
+ for (int i = 0; i < removedPositions.size(); i++)
{
- allPairs[removedPositions[i]].x = INT_MAX ;
- allPairs[removedPositions[i]].y = INT_MAX ;
+ allPairs[removedPositions[i]].x = INT_MAX;
+ allPairs[removedPositions[i]].y = INT_MAX;
}
allPairs.quickSort(b3PairCmp);
- allPairs.resize(allPairs.size()-uniqueRemovedPairs);
+ allPairs.resize(allPairs.size() - uniqueRemovedPairs);
}
}
//if (uniqueRemovedPairs)
@@ -848,33 +811,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
prevPair.x = -1;
prevPair.y = -1;
-
- int uniqueAddedPairs=0;
+
+ int uniqueAddedPairs = 0;
b3AlignedObjectArray<b3Int4> actualAddedPairs;
{
B3_PROFILE("actual adding");
- for (int i=0;i<addedHostPairs.size();i++)
+ for (int i = 0; i < addedHostPairs.size(); i++)
{
b3Int4 newPair = addedHostPairs[i];
if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))
{
-//#ifdef _DEBUG
+ //#ifdef _DEBUG
int index1 = allPairs.findBinarySearch(newPair);
-
-
+
int index2 = allPairs.findLinearSearch(newPair);
- b3Assert(index1==index2);
-
+ b3Assert(index1 == index2);
- b3Assert(index1==allPairs.size());
- if (index1!=allPairs.size())
+ b3Assert(index1 == allPairs.size());
+ if (index1 != allPairs.size())
{
printf("??\n");
}
- if (index1==allPairs.size())
-//#endif //_DEBUG
+ if (index1 == allPairs.size())
+ //#endif //_DEBUG
{
uniqueAddedPairs++;
actualAddedPairs.push_back(newPair);
@@ -882,94 +843,83 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
}
prevPair = newPair;
}
- for (int i=0;i<actualAddedPairs.size();i++)
+ for (int i = 0; i < actualAddedPairs.size(); i++)
{
//printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);
allPairs.push_back(actualAddedPairs[i]);
}
}
-
+
//if (uniqueAddedPairs)
// printf("uniqueAddedPairs=%d\n", uniqueAddedPairs);
-
{
B3_PROFILE("m_overlappingPairs.copyFromHost");
m_overlappingPairs.copyFromHost(allPairs);
}
-
-
}
-
-
-
-void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
+void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
//test
-// if (m_currentBuffer>=0)
+ // if (m_currentBuffer>=0)
// return calculateOverlappingPairsHostIncremental3Sap();
b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
-
-
- int axis=0;
+ int axis = 0;
{
B3_PROFILE("CPU compute best variance axis");
- b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0);
+ b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0);
int numRigidBodies = m_smallAabbsMappingCPU.size();
- for(int i=0;i<numRigidBodies;i++)
+ for (int i = 0; i < numRigidBodies; i++)
{
b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
- b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]);
- b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]);
- b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f;
-
+ b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]);
+ b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]);
+ b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f;
+
s += centerAabb;
- s2 += centerAabb*centerAabb;
+ s2 += centerAabb * centerAabb;
}
- b3Vector3 v = s2 - (s*s) / (float)numRigidBodies;
-
- if(v[1] > v[0])
+ b3Vector3 v = s2 - (s * s) / (float)numRigidBodies;
+
+ if (v[1] > v[0])
axis = 1;
- if(v[2] > v[axis])
+ if (v[2] > v[axis])
axis = 2;
}
-
-
-
b3AlignedObjectArray<b3Int4> hostPairs;
{
int numSmallAabbs = m_smallAabbsMappingCPU.size();
- for (int i=0;i<numSmallAabbs;i++)
+ for (int i = 0; i < numSmallAabbs; i++)
{
b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
//float reference = smallAabbi.m_max[axis];
- for (int j=i+1;j<numSmallAabbs;j++)
+ for (int j = i + 1; j < numSmallAabbs; j++)
{
-
b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]];
if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
- (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max))
+ (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max))
{
b3Int4 pair;
int a = smallAabbi.m_minIndices[3];
int b = smallAabbj.m_minIndices[3];
- if (a<=b)
+ if (a <= b)
{
- pair.x = a;//store the original index in the unsorted aabb array
+ pair.x = a; //store the original index in the unsorted aabb array
pair.y = b;
- } else
+ }
+ else
{
- pair.x = b;//store the original index in the unsorted aabb array
+ pair.x = b; //store the original index in the unsorted aabb array
pair.y = a;
}
hostPairs.push_back(pair);
@@ -978,35 +928,35 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
-
{
int numSmallAabbs = m_smallAabbsMappingCPU.size();
- for (int i=0;i<numSmallAabbs;i++)
+ for (int i = 0; i < numSmallAabbs; i++)
{
b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
//float reference = smallAabbi.m_max[axis];
int numLargeAabbs = m_largeAabbsMappingCPU.size();
- for (int j=0;j<numLargeAabbs;j++)
+ for (int j = 0; j < numLargeAabbs; j++)
{
b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]];
if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
- (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max))
+ (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max))
{
b3Int4 pair;
int a = largeAabbj.m_minIndices[3];
int b = smallAabbi.m_minIndices[3];
- if (a<=b)
+ if (a <= b)
{
- pair.x = a;
- pair.y = b;//store the original index in the unsorted aabb array
- } else
+ pair.x = a;
+ pair.y = b; //store the original index in the unsorted aabb array
+ }
+ else
{
pair.x = b;
- pair.y = a;//store the original index in the unsorted aabb array
+ pair.y = a; //store the original index in the unsorted aabb array
}
-
+
hostPairs.push_back(pair);
}
}
@@ -1021,21 +971,20 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
if (hostPairs.size())
{
m_overlappingPairs.copyFromHost(hostPairs);
- } else
+ }
+ else
{
m_overlappingPairs.resize(0);
}
//init3dSap();
-
}
-void b3GpuSapBroadphase::reset()
+void b3GpuSapBroadphase::reset()
{
m_allAabbsGPU.resize(0);
m_allAabbsCPU.resize(0);
-
m_smallAabbsMappingGPU.resize(0);
m_smallAabbsMappingCPU.resize(0);
@@ -1043,13 +992,11 @@ void b3GpuSapBroadphase::reset()
m_largeAabbsMappingGPU.resize(0);
m_largeAabbsMappingCPU.resize(0);
-
}
-
-void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
+void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
{
- if (m_sapKernel==0)
+ if (m_sapKernel == 0)
{
calculateOverlappingPairsHost(maxPairs);
return;
@@ -1065,68 +1012,62 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
int axis = 0;
{
+ //bool syncOnHost = false;
- //bool syncOnHost = false;
-
- int numSmallAabbs = m_smallAabbsMappingCPU.size();
- if (m_prefixScanFloat4 && numSmallAabbs)
- {
- B3_PROFILE("GPU compute best variance axis");
-
- if (m_dst.size()!=(numSmallAabbs+1))
+ int numSmallAabbs = m_smallAabbsMappingCPU.size();
+ if (m_prefixScanFloat4 && numSmallAabbs)
{
- m_dst.resize(numSmallAabbs+128);
- m_sum.resize(numSmallAabbs+128);
- m_sum2.resize(numSmallAabbs+128);
- m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow?
- m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow?
- }
+ B3_PROFILE("GPU compute best variance axis");
- b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel");
- launcher.setBuffer(m_allAabbsGPU.getBufferCL());
-
- launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
- launcher.setBuffer(m_sum.getBufferCL());
- launcher.setBuffer(m_sum2.getBufferCL());
- launcher.setConst( numSmallAabbs );
- int num = numSmallAabbs;
- launcher.launch1D( num);
-
+ if (m_dst.size() != (numSmallAabbs + 1))
+ {
+ m_dst.resize(numSmallAabbs + 128);
+ m_sum.resize(numSmallAabbs + 128);
+ m_sum2.resize(numSmallAabbs + 128);
+ m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
+ m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
+ }
- b3Vector3 s;
- b3Vector3 s2;
- m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s);
- m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2);
+ b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel");
+ launcher.setBuffer(m_allAabbsGPU.getBufferCL());
- b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs;
-
- if(v[1] > v[0])
- axis = 1;
- if(v[2] > v[axis])
- axis = 2;
- }
+ launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+ launcher.setBuffer(m_sum.getBufferCL());
+ launcher.setBuffer(m_sum2.getBufferCL());
+ launcher.setConst(numSmallAabbs);
+ int num = numSmallAabbs;
+ launcher.launch1D(num);
+ b3Vector3 s;
+ b3Vector3 s2;
+ m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s);
+ m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2);
+
+ b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs;
+
+ if (v[1] > v[0])
+ axis = 1;
+ if (v[2] > v[axis])
+ axis = 2;
+ }
-
m_gpuSmallSortData.resize(numSmallAabbs);
-
#if 1
if (m_smallAabbsMappingGPU.size())
{
-
B3_PROFILE("flipFloatKernel");
- b3BufferInfoCL bInfo[] = {
- b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
- b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true),
- b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())};
- b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( numSmallAabbs );
- launcher.setConst( axis );
-
+ b3BufferInfoCL bInfo[] = {
+ b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
+ b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
+ b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())};
+ b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(numSmallAabbs);
+ launcher.setConst(axis);
+
int num = numSmallAabbs;
- launcher.launch1D( num);
+ launcher.launch1D(num);
clFinish(m_queue);
}
@@ -1141,69 +1082,66 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
if (numSmallAabbs)
{
B3_PROFILE("scatterKernel");
-
- b3BufferInfoCL bInfo[] = {
- b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
- b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true),
- b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),
+
+ b3BufferInfoCL bInfo[] = {
+ b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
+ b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
+ b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true),
b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
- b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel ");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( numSmallAabbs);
+ b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel ");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(numSmallAabbs);
int num = numSmallAabbs;
- launcher.launch1D( num);
+ launcher.launch1D(num);
clFinish(m_queue);
-
}
-
- m_overlappingPairs.resize(maxPairs);
+ m_overlappingPairs.resize(maxPairs);
- m_pairCount.resize(0);
- m_pairCount.push_back(0);
- int numPairs=0;
+ m_pairCount.resize(0);
+ m_pairCount.push_back(0);
+ int numPairs = 0;
+ {
+ int numLargeAabbs = m_largeAabbsMappingGPU.size();
+ if (numLargeAabbs && numSmallAabbs)
{
- int numLargeAabbs = m_largeAabbsMappingGPU.size();
- if (numLargeAabbs && numSmallAabbs)
+ //@todo
+ B3_PROFILE("sap2Kernel");
+ b3BufferInfoCL bInfo[] = {
+ b3BufferInfoCL(m_allAabbsGPU.getBufferCL()),
+ b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
+ b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
+ b3BufferInfoCL(m_overlappingPairs.getBufferCL()),
+ b3BufferInfoCL(m_pairCount.getBufferCL())};
+ b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(numLargeAabbs);
+ launcher.setConst(numSmallAabbs);
+ launcher.setConst(axis);
+ launcher.setConst(maxPairs);
+ //@todo: use actual maximum work item sizes of the device instead of hardcoded values
+ launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
+
+ numPairs = m_pairCount.at(0);
+ if (numPairs > maxPairs)
{
- //@todo
- B3_PROFILE("sap2Kernel");
- b3BufferInfoCL bInfo[] = {
- b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ),
- b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
- b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
- b3BufferInfoCL( m_overlappingPairs.getBufferCL() ),
- b3BufferInfoCL(m_pairCount.getBufferCL())};
- b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( numLargeAabbs );
- launcher.setConst( numSmallAabbs);
- launcher.setConst( axis );
- launcher.setConst( maxPairs );
-//@todo: use actual maximum work item sizes of the device instead of hardcoded values
- launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
-
- numPairs = m_pairCount.at(0);
- if (numPairs >maxPairs)
- {
- b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
- numPairs =maxPairs;
- }
+ b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+ numPairs = maxPairs;
}
}
- if (m_gpuSmallSortedAabbs.size())
- {
- B3_PROFILE("sapKernel");
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())};
- b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( numSmallAabbs );
- launcher.setConst( axis );
- launcher.setConst( maxPairs );
-
-
- int num = numSmallAabbs;
+ }
+ if (m_gpuSmallSortedAabbs.size())
+ {
+ B3_PROFILE("sapKernel");
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())};
+ b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(numSmallAabbs);
+ launcher.setConst(axis);
+ launcher.setConst(maxPairs);
+
+ int num = numSmallAabbs;
#if 0
int buffSize = launcher.getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
@@ -1225,73 +1163,71 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
FILE* f = fopen("m_sapKernelArgs.bin","wb");
fwrite(buf,buffSize+sizeof(int),1,f);
fclose(f);
-#endif//
+#endif //
- launcher.launch1D( num);
- clFinish(m_queue);
-
- numPairs = m_pairCount.at(0);
- if (numPairs>maxPairs)
- {
- b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
- numPairs = maxPairs;
- m_pairCount.resize(0);
- m_pairCount.push_back(maxPairs);
- }
+ launcher.launch1D(num);
+ clFinish(m_queue);
+
+ numPairs = m_pairCount.at(0);
+ if (numPairs > maxPairs)
+ {
+ b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+ numPairs = maxPairs;
+ m_pairCount.resize(0);
+ m_pairCount.push_back(maxPairs);
}
-
+ }
+
#else
- int numPairs = 0;
-
-
- b3LauncherCL launcher(m_queue, m_sapKernel);
-
- const char* fileName = "m_sapKernelArgs.bin";
- FILE* f = fopen(fileName,"rb");
- if (f)
- {
- int sizeInBytes=0;
- if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
- {
- printf("error, cannot get file size\n");
- exit(0);
- }
-
- unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
- fread(buf,sizeInBytes,1,f);
- int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
- int num = *(int*)&buf[serializedBytes];
- launcher.launch1D( num);
-
- b3OpenCLArray<int> pairCount(m_context, m_queue);
- int numElements = launcher.m_arrays[2]->size()/sizeof(int);
- pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
- numPairs = pairCount.at(0);
- //printf("overlapping pairs = %d\n",numPairs);
- b3AlignedObjectArray<b3Int4> hostOoverlappingPairs;
- b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue);
- tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
-
- tmpGpuPairs.copyToHost(hostOoverlappingPairs);
- m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
- //printf("hello %d\n", m_overlappingPairs.size());
- free(buf);
- fclose(f);
-
- } else {
- printf("error: cannot find file %s\n",fileName);
- }
-
- clFinish(m_queue);
-
-
+ int numPairs = 0;
+
+ b3LauncherCL launcher(m_queue, m_sapKernel);
+
+ const char* fileName = "m_sapKernelArgs.bin";
+ FILE* f = fopen(fileName, "rb");
+ if (f)
+ {
+ int sizeInBytes = 0;
+ if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
+ {
+ printf("error, cannot get file size\n");
+ exit(0);
+ }
+
+ unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
+ fread(buf, sizeInBytes, 1, f);
+ int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
+ int num = *(int*)&buf[serializedBytes];
+ launcher.launch1D(num);
+
+ b3OpenCLArray<int> pairCount(m_context, m_queue);
+ int numElements = launcher.m_arrays[2]->size() / sizeof(int);
+ pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements);
+ numPairs = pairCount.at(0);
+ //printf("overlapping pairs = %d\n",numPairs);
+ b3AlignedObjectArray<b3Int4> hostOoverlappingPairs;
+ b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue);
+ tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs);
+
+ tmpGpuPairs.copyToHost(hostOoverlappingPairs);
+ m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
+ //printf("hello %d\n", m_overlappingPairs.size());
+ free(buf);
+ fclose(f);
+ }
+ else
+ {
+ printf("error: cannot find file %s\n", fileName);
+ }
+
+ clFinish(m_queue);
+
#endif
-
- m_overlappingPairs.resize(numPairs);
-
- }//B3_PROFILE("GPU_RADIX SORT");
- //init3dSap();
+ m_overlappingPairs.resize(numPairs);
+
+ } //B3_PROFILE("GPU_RADIX SORT");
+ //init3dSap();
}
void b3GpuSapBroadphase::writeAabbsToGpu()
@@ -1299,17 +1235,14 @@ void b3GpuSapBroadphase::writeAabbsToGpu()
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
- m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
-
-
-
+ m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this
}
-void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
+void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
- for (int i=0;i<4;i++)
+ for (int i = 0; i < 4; i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
@@ -1317,15 +1250,15 @@ void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vec
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size());
-
+
m_allAabbsCPU.push_back(aabb);
}
-void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
+void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
- for (int i=0;i<4;i++)
+ for (int i = 0; i < 4; i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
@@ -1334,20 +1267,19 @@ void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3&
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size());
-
m_allAabbsCPU.push_back(aabb);
}
-cl_mem b3GpuSapBroadphase::getAabbBufferWS()
+cl_mem b3GpuSapBroadphase::getAabbBufferWS()
{
return m_allAabbsGPU.getBufferCL();
}
-int b3GpuSapBroadphase::getNumOverlap()
+int b3GpuSapBroadphase::getNumOverlap()
{
return m_overlappingPairs.size();
}
-cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
+cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
{
return m_overlappingPairs.getBufferCL();
}
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
index 8d36ac78f2..d17590b14a 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@@ -2,7 +2,7 @@
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
-#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
@@ -11,141 +11,133 @@ class b3Vector3;
#include "b3GpuBroadphaseInterface.h"
-
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
-
- cl_context m_context;
- cl_device_id m_device;
- cl_command_queue m_queue;
- cl_kernel m_flipFloatKernel;
- cl_kernel m_scatterKernel ;
- cl_kernel m_copyAabbsKernel;
- cl_kernel m_sapKernel;
- cl_kernel m_sap2Kernel;
- cl_kernel m_prepareSumVarianceKernel;
-
+ cl_context m_context;
+ cl_device_id m_device;
+ cl_command_queue m_queue;
+ cl_kernel m_flipFloatKernel;
+ cl_kernel m_scatterKernel;
+ cl_kernel m_copyAabbsKernel;
+ cl_kernel m_sapKernel;
+ cl_kernel m_sap2Kernel;
+ cl_kernel m_prepareSumVarianceKernel;
class b3RadixSort32CL* m_sorter;
///test for 3d SAP
- b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
- b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
- b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
-
- b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
- b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
- b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
- b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
- b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
- b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
-
-
- b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
- b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
- b3OpenCLArray<int> m_addedCountGPU;
- b3OpenCLArray<int> m_removedCountGPU;
-
- int m_currentBuffer;
+ b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
+ b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
+ b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
+
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
+ b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
+
+ b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
+ b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
+ b3OpenCLArray<int> m_addedCountGPU;
+ b3OpenCLArray<int> m_removedCountGPU;
+
+ int m_currentBuffer;
public:
-
b3OpenCLArray<int> m_pairCount;
+ b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
+ b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
- b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
- b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
-
- virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
+ virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
{
return m_allAabbsGPU;
}
- virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
+ virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
{
return m_allAabbsCPU;
}
- b3OpenCLArray<b3Vector3> m_sum;
- b3OpenCLArray<b3Vector3> m_sum2;
- b3OpenCLArray<b3Vector3> m_dst;
+ b3OpenCLArray<b3Vector3> m_sum;
+ b3OpenCLArray<b3Vector3> m_sum2;
+ b3OpenCLArray<b3Vector3> m_dst;
- b3OpenCLArray<int> m_smallAabbsMappingGPU;
+ b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
- b3OpenCLArray<int> m_largeAabbsMappingGPU;
+ b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
-
- b3OpenCLArray<b3Int4> m_overlappingPairs;
+ b3OpenCLArray<b3Int4> m_overlappingPairs;
//temporary gpu work memory
- b3OpenCLArray<b3SortData> m_gpuSmallSortData;
- b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
+ b3OpenCLArray<b3SortData> m_gpuSmallSortData;
+ b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
- class b3PrefixScanFloat4CL* m_prefixScanFloat4;
+ class b3PrefixScanFloat4CL* m_prefixScanFloat4;
enum b3GpuSapKernelType
{
- B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
+ B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
B3_GPU_SAP_KERNEL_ORIGINAL,
B3_GPU_SAP_KERNEL_BARRIER,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
};
- b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
+ b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
virtual ~b3GpuSapBroadphase();
-
- static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
+
+ static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
+ return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
}
- static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
+ static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
+ return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
}
- static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
+ static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
+ return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
}
- static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
+ static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
+ return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
}
- static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
+ static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
{
- return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
+ return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
}
-
- virtual void calculateOverlappingPairs(int maxPairs);
- virtual void calculateOverlappingPairsHost(int maxPairs);
-
- void reset();
+ virtual void calculateOverlappingPairs(int maxPairs);
+ virtual void calculateOverlappingPairsHost(int maxPairs);
+
+ void reset();
void init3dSap();
virtual void calculateOverlappingPairsHostIncremental3Sap();
- virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
- virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
+ virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+ virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
- virtual cl_mem getAabbBufferWS();
- virtual int getNumOverlap();
- virtual cl_mem getOverlappingPairBuffer();
-
+ virtual cl_mem getAabbBufferWS();
+ virtual int getNumOverlap();
+ virtual cl_mem getOverlappingPairBuffer();
+
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
-#endif //B3_GPU_SAP_BROADPHASE_H \ No newline at end of file
+#endif //B3_GPU_SAP_BROADPHASE_H \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
index ea6550fede..60570f2605 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
@@ -5,10 +5,9 @@
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
-B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
-{
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3SapAabb : public b3Aabb{
-};
+ };
-
-#endif //B3_SAP_AABB_H
+#endif //B3_SAP_AABB_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
index dad42477c3..0185417786 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h
@@ -1,199 +1,198 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* gridBroadphaseCL= \
-"int getPosHash(int4 gridPos, __global float4* pParams)\n"
-"{\n"
-" int4 gridDim = *((__global int4*)(pParams + 1));\n"
-" gridPos.x &= gridDim.x - 1;\n"
-" gridPos.y &= gridDim.y - 1;\n"
-" gridPos.z &= gridDim.z - 1;\n"
-" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
-" return hash;\n"
-"} \n"
-"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
-"{\n"
-" int4 gridPos;\n"
-" int4 gridDim = *((__global int4*)(pParams + 1));\n"
-" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
-" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
-" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
-" return gridPos;\n"
-"}\n"
-"// calculate grid hash value for each body using its AABB\n"
-"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
-"{\n"
-" int index = get_global_id(0);\n"
-" if(index >= numObjects)\n"
-" {\n"
-" return;\n"
-" }\n"
-" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
-" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
-" float4 pos;\n"
-" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
-" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
-" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
-" pos.w = 0.f;\n"
-" // get address in grid\n"
-" int4 gridPos = getGridPos(pos, pParams);\n"
-" int gridHash = getPosHash(gridPos, pParams);\n"
-" // store grid hash and body index\n"
-" int2 hashVal;\n"
-" hashVal.x = gridHash;\n"
-" hashVal.y = index;\n"
-" pHash[index] = hashVal;\n"
-"}\n"
-"__kernel void kClearCellStart( int numCells, \n"
-" __global int* pCellStart )\n"
-"{\n"
-" int index = get_global_id(0);\n"
-" if(index >= numCells)\n"
-" {\n"
-" return;\n"
-" }\n"
-" pCellStart[index] = -1;\n"
-"}\n"
-"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
-"{\n"
-" __local int sharedHash[513];\n"
-" int index = get_global_id(0);\n"
-" int2 sortedData;\n"
-" if(index < numObjects)\n"
-" {\n"
-" sortedData = pHash[index];\n"
-" // Load hash data into shared memory so that we can look \n"
-" // at neighboring body's hash value without loading\n"
-" // two hash values per thread\n"
-" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
-" if((index > 0) && (get_local_id(0) == 0))\n"
-" {\n"
-" // first thread in block must load neighbor body hash\n"
-" sharedHash[0] = pHash[index-1].x;\n"
-" }\n"
-" }\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" if(index < numObjects)\n"
-" {\n"
-" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
-" {\n"
-" cellStart[sortedData.x] = index;\n"
-" }\n"
-" }\n"
-"}\n"
-"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
-"{\n"
-" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
-" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
-" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
-"}\n"
-"//search for AABB 'index' against other AABBs' in this cell\n"
-"void findPairsInCell( int numObjects,\n"
-" int4 gridPos,\n"
-" int index,\n"
-" __global int2* pHash,\n"
-" __global int* pCellStart,\n"
-" __global float4* allpAABB, \n"
-" __global const int* smallAabbMapping,\n"
-" __global float4* pParams,\n"
-" volatile __global int* pairCount,\n"
-" __global int4* pPairBuff2,\n"
-" int maxPairs\n"
-" )\n"
-"{\n"
-" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
-" int maxBodiesPerCell = pGridDim.w;\n"
-" int gridHash = getPosHash(gridPos, pParams);\n"
-" // get start of bucket for this cell\n"
-" int bucketStart = pCellStart[gridHash];\n"
-" if (bucketStart == -1)\n"
-" {\n"
-" return; // cell empty\n"
-" }\n"
-" // iterate over bodies in this cell\n"
-" int2 sortedData = pHash[index];\n"
-" int unsorted_indx = sortedData.y;\n"
-" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
-" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
-" int handleIndex = as_int(min0.w);\n"
-" \n"
-" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
-" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
-" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
-" {\n"
-" int2 cellData = pHash[index2];\n"
-" if (cellData.x != gridHash)\n"
-" {\n"
-" break; // no longer in same bucket\n"
-" }\n"
-" int unsorted_indx2 = cellData.y;\n"
-" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
-" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
-" { \n"
-" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
-" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
-" if(testAABBOverlap(min0, max0, min1, max1))\n"
-" {\n"
-" if (pairCount)\n"
-" {\n"
-" int handleIndex2 = as_int(min1.w);\n"
-" if (handleIndex<handleIndex2)\n"
-" {\n"
-" int curPair = atomic_add(pairCount,1);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" int4 newpair;\n"
-" newpair.x = handleIndex;\n"
-" newpair.y = handleIndex2;\n"
-" newpair.z = -1;\n"
-" newpair.w = -1;\n"
-" pPairBuff2[curPair] = newpair;\n"
-" }\n"
-" }\n"
-" \n"
-" }\n"
-" }\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void kFindOverlappingPairs( int numObjects,\n"
-" __global float4* allpAABB, \n"
-" __global const int* smallAabbMapping,\n"
-" __global int2* pHash, \n"
-" __global int* pCellStart, \n"
-" __global float4* pParams ,\n"
-" volatile __global int* pairCount,\n"
-" __global int4* pPairBuff2,\n"
-" int maxPairs\n"
-" )\n"
-"{\n"
-" int index = get_global_id(0);\n"
-" if(index >= numObjects)\n"
-" {\n"
-" return;\n"
-" }\n"
-" int2 sortedData = pHash[index];\n"
-" int unsorted_indx = sortedData.y;\n"
-" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
-" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
-" float4 pos;\n"
-" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
-" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
-" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
-" // get address in grid\n"
-" int4 gridPosA = getGridPos(pos, pParams);\n"
-" int4 gridPosB; \n"
-" // examine only neighbouring cells\n"
-" for(int z=-1; z<=1; z++) \n"
-" {\n"
-" gridPosB.z = gridPosA.z + z;\n"
-" for(int y=-1; y<=1; y++) \n"
-" {\n"
-" gridPosB.y = gridPosA.y + y;\n"
-" for(int x=-1; x<=1; x++) \n"
-" {\n"
-" gridPosB.x = gridPosA.x + x;\n"
-" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
-" }\n"
-" }\n"
-" }\n"
-"}\n"
-;
+static const char* gridBroadphaseCL =
+ "int getPosHash(int4 gridPos, __global float4* pParams)\n"
+ "{\n"
+ " int4 gridDim = *((__global int4*)(pParams + 1));\n"
+ " gridPos.x &= gridDim.x - 1;\n"
+ " gridPos.y &= gridDim.y - 1;\n"
+ " gridPos.z &= gridDim.z - 1;\n"
+ " int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
+ " return hash;\n"
+ "} \n"
+ "int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
+ "{\n"
+ " int4 gridPos;\n"
+ " int4 gridDim = *((__global int4*)(pParams + 1));\n"
+ " gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
+ " gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
+ " gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
+ " return gridPos;\n"
+ "}\n"
+ "// calculate grid hash value for each body using its AABB\n"
+ "__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
+ "{\n"
+ " int index = get_global_id(0);\n"
+ " if(index >= numObjects)\n"
+ " {\n"
+ " return;\n"
+ " }\n"
+ " float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
+ " float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
+ " float4 pos;\n"
+ " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+ " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+ " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+ " pos.w = 0.f;\n"
+ " // get address in grid\n"
+ " int4 gridPos = getGridPos(pos, pParams);\n"
+ " int gridHash = getPosHash(gridPos, pParams);\n"
+ " // store grid hash and body index\n"
+ " int2 hashVal;\n"
+ " hashVal.x = gridHash;\n"
+ " hashVal.y = index;\n"
+ " pHash[index] = hashVal;\n"
+ "}\n"
+ "__kernel void kClearCellStart( int numCells, \n"
+ " __global int* pCellStart )\n"
+ "{\n"
+ " int index = get_global_id(0);\n"
+ " if(index >= numCells)\n"
+ " {\n"
+ " return;\n"
+ " }\n"
+ " pCellStart[index] = -1;\n"
+ "}\n"
+ "__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
+ "{\n"
+ " __local int sharedHash[513];\n"
+ " int index = get_global_id(0);\n"
+ " int2 sortedData;\n"
+ " if(index < numObjects)\n"
+ " {\n"
+ " sortedData = pHash[index];\n"
+ " // Load hash data into shared memory so that we can look \n"
+ " // at neighboring body's hash value without loading\n"
+ " // two hash values per thread\n"
+ " sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
+ " if((index > 0) && (get_local_id(0) == 0))\n"
+ " {\n"
+ " // first thread in block must load neighbor body hash\n"
+ " sharedHash[0] = pHash[index-1].x;\n"
+ " }\n"
+ " }\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " if(index < numObjects)\n"
+ " {\n"
+ " if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
+ " {\n"
+ " cellStart[sortedData.x] = index;\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
+ "{\n"
+ " return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
+ " (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
+ " (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
+ "}\n"
+ "//search for AABB 'index' against other AABBs' in this cell\n"
+ "void findPairsInCell( int numObjects,\n"
+ " int4 gridPos,\n"
+ " int index,\n"
+ " __global int2* pHash,\n"
+ " __global int* pCellStart,\n"
+ " __global float4* allpAABB, \n"
+ " __global const int* smallAabbMapping,\n"
+ " __global float4* pParams,\n"
+ " volatile __global int* pairCount,\n"
+ " __global int4* pPairBuff2,\n"
+ " int maxPairs\n"
+ " )\n"
+ "{\n"
+ " int4 pGridDim = *((__global int4*)(pParams + 1));\n"
+ " int maxBodiesPerCell = pGridDim.w;\n"
+ " int gridHash = getPosHash(gridPos, pParams);\n"
+ " // get start of bucket for this cell\n"
+ " int bucketStart = pCellStart[gridHash];\n"
+ " if (bucketStart == -1)\n"
+ " {\n"
+ " return; // cell empty\n"
+ " }\n"
+ " // iterate over bodies in this cell\n"
+ " int2 sortedData = pHash[index];\n"
+ " int unsorted_indx = sortedData.y;\n"
+ " float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
+ " float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+ " int handleIndex = as_int(min0.w);\n"
+ " \n"
+ " int bucketEnd = bucketStart + maxBodiesPerCell;\n"
+ " bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
+ " for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
+ " {\n"
+ " int2 cellData = pHash[index2];\n"
+ " if (cellData.x != gridHash)\n"
+ " {\n"
+ " break; // no longer in same bucket\n"
+ " }\n"
+ " int unsorted_indx2 = cellData.y;\n"
+ " //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
+ " if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
+ " { \n"
+ " float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
+ " float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
+ " if(testAABBOverlap(min0, max0, min1, max1))\n"
+ " {\n"
+ " if (pairCount)\n"
+ " {\n"
+ " int handleIndex2 = as_int(min1.w);\n"
+ " if (handleIndex<handleIndex2)\n"
+ " {\n"
+ " int curPair = atomic_add(pairCount,1);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " int4 newpair;\n"
+ " newpair.x = handleIndex;\n"
+ " newpair.y = handleIndex2;\n"
+ " newpair.z = -1;\n"
+ " newpair.w = -1;\n"
+ " pPairBuff2[curPair] = newpair;\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void kFindOverlappingPairs( int numObjects,\n"
+ " __global float4* allpAABB, \n"
+ " __global const int* smallAabbMapping,\n"
+ " __global int2* pHash, \n"
+ " __global int* pCellStart, \n"
+ " __global float4* pParams ,\n"
+ " volatile __global int* pairCount,\n"
+ " __global int4* pPairBuff2,\n"
+ " int maxPairs\n"
+ " )\n"
+ "{\n"
+ " int index = get_global_id(0);\n"
+ " if(index >= numObjects)\n"
+ " {\n"
+ " return;\n"
+ " }\n"
+ " int2 sortedData = pHash[index];\n"
+ " int unsorted_indx = sortedData.y;\n"
+ " float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
+ " float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
+ " float4 pos;\n"
+ " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
+ " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
+ " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
+ " // get address in grid\n"
+ " int4 gridPosA = getGridPos(pos, pParams);\n"
+ " int4 gridPosB; \n"
+ " // examine only neighbouring cells\n"
+ " for(int z=-1; z<=1; z++) \n"
+ " {\n"
+ " gridPosB.z = gridPosA.z + z;\n"
+ " for(int y=-1; y<=1; y++) \n"
+ " {\n"
+ " gridPosB.y = gridPosA.y + y;\n"
+ " for(int x=-1; x<=1; x++) \n"
+ " {\n"
+ " gridPosB.x = gridPosA.x + x;\n"
+ " findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
index 5eb8f45b16..c02877dde9 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@@ -1,729 +1,728 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* parallelLinearBvhCL= \
-"/*\n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose,\n"
-"including commercial applications, and to alter it and redistribute it freely,\n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Initial Author Jackson Lee, 2014\n"
-"typedef float b3Scalar;\n"
-"typedef float4 b3Vector3;\n"
-"#define b3Max max\n"
-"#define b3Min min\n"
-"#define b3Sqrt sqrt\n"
-"typedef struct\n"
-"{\n"
-" unsigned int m_key;\n"
-" unsigned int m_value;\n"
-"} SortDataCL;\n"
-"typedef struct \n"
-"{\n"
-" union\n"
-" {\n"
-" float4 m_min;\n"
-" float m_minElems[4];\n"
-" int m_minIndices[4];\n"
-" };\n"
-" union\n"
-" {\n"
-" float4 m_max;\n"
-" float m_maxElems[4];\n"
-" int m_maxIndices[4];\n"
-" };\n"
-"} b3AabbCL;\n"
-"unsigned int interleaveBits(unsigned int x)\n"
-"{\n"
-" //........ ........ ......12 3456789A //x\n"
-" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n"
-" \n"
-" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n"
-" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n"
-" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n"
-" \n"
-" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n"
-" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n"
-" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n"
-" \n"
-" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n"
-" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n"
-" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n"
-" \n"
-" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n"
-" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n"
-" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n"
-" \n"
-" //........ ........ ......11 11111111 //0x000003FF\n"
-" x &= 0x000003FF; //Clear all bits above bit 10\n"
-" \n"
-" x = (x ^ (x << 16)) & 0xFF0000FF;\n"
-" x = (x ^ (x << 8)) & 0x0300F00F;\n"
-" x = (x ^ (x << 4)) & 0x030C30C3;\n"
-" x = (x ^ (x << 2)) & 0x09249249;\n"
-" \n"
-" return x;\n"
-"}\n"
-"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
-"{\n"
-" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
-"}\n"
-"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
-"{\n"
-" int separatedAabbIndex = get_global_id(0);\n"
-" if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
-" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
-" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
-"}\n"
-"//Should replace with an optimized parallel reduction\n"
-"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
-"{\n"
-" //Each time this kernel is added to the command queue, \n"
-" //the number of AABBs needing to be merged is halved\n"
-" //\n"
-" //Example with 159 AABBs:\n"
-" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
-" // numMergedAabbs == 159 - 80 == 79\n"
-" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
-" \n"
-" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
-" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
-" \n"
-" int aabbIndex = get_global_id(0);\n"
-" if(aabbIndex >= numMergedAabbs) return;\n"
-" \n"
-" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
-" \n"
-" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
-" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
-" \n"
-" b3AabbCL mergedAabb;\n"
-" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
-" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
-" out_mergedAabb[aabbIndex] = mergedAabb;\n"
-"}\n"
-"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
-" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
-"{\n"
-" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n"
-" if(leafNodeIndex >= numAabbs) return;\n"
-" \n"
-" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
-" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
-" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
-" \n"
-" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
-" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
-" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
-" \n"
-" //Quantize into integer coordinates\n"
-" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
-" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
-" \n"
-" int4 discretePosition;\n"
-" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
-" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
-" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
-" \n"
-" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
-" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
-" discretePosition += 512;\n"
-" \n"
-" //Interleave bits(assign a morton code, also known as a z-curve)\n"
-" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
-" \n"
-" //\n"
-" SortDataCL mortonCodeIndexPair;\n"
-" mortonCodeIndexPair.m_key = mortonCode;\n"
-" mortonCodeIndexPair.m_value = leafNodeIndex;\n"
-" \n"
-" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
-"}\n"
-"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
-"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
-"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
-"//In both cases, the bit should be cleared to access the actual node index.\n"
-"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
-"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
-"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
-"//From sap.cl\n"
-"#define NEW_PAIR_MARKER -1\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
-"{\n"
-" bool overlap = true;\n"
-" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-" return overlap;\n"
-"}\n"
-"//From sap.cl\n"
-"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
-" __global int* rootNodeIndex, \n"
-" __global int2* internalNodeChildIndices, \n"
-" __global b3AabbCL* internalNodeAabbs,\n"
-" __global int2* internalNodeLeafIndexRanges,\n"
-" \n"
-" __global SortDataCL* mortonCodesAndAabbIndices,\n"
-" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
-" int maxPairs, int numQueryAabbs)\n"
-"{\n"
-" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
-" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
-" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
-" if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
-" \n"
-" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
-" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
-" \n"
-" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
-" \n"
-" int stackSize = 1;\n"
-" stack[0] = *rootNodeIndex;\n"
-" \n"
-" while(stackSize)\n"
-" {\n"
-" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
-" --stackSize;\n"
-" \n"
-" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
-" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
-" \n"
-" //Optimization - if the BVH is structured as a binary radix tree, then\n"
-" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
-" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
-" {\n"
-" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
-" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
-" }\n"
-" \n"
-" //bvhRigidIndex is not used if internal node\n"
-" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
-" \n"
-" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
-" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
-" {\n"
-" if(isLeaf)\n"
-" {\n"
-" int4 pair;\n"
-" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
-" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
-" pair.z = NEW_PAIR_MARKER;\n"
-" pair.w = NEW_PAIR_MARKER;\n"
-" \n"
-" int pairIndex = atomic_inc(out_numPairs);\n"
-" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
-" }\n"
-" \n"
-" if(!isLeaf) //Internal node\n"
-" {\n"
-" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
-" {\n"
-" //Error\n"
-" }\n"
-" else\n"
-" {\n"
-" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
-" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
-" }\n"
-" }\n"
-" }\n"
-" \n"
-" }\n"
-"}\n"
-"//From rayCastKernels.cl\n"
-"typedef struct\n"
-"{\n"
-" float4 m_from;\n"
-" float4 m_to;\n"
-"} b3RayInfo;\n"
-"//From rayCastKernels.cl\n"
-"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
-"{\n"
-" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
-" return normalize(normal); //OpenCL normalize == vector4 normalize\n"
-"}\n"
-"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
-"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
-"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
-"{\n"
-" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
-" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
-" //\n"
-" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
-" //and min.x will be the far plane; otherwise, it is reversed.\n"
-" //\n"
-" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
-" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
-" \n"
-" int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n"
-" \n"
-" //When using vector types, the select() function checks the most signficant bit, \n"
-" //but isless() sets the least significant bit.\n"
-" isNegative <<= 31;\n"
-" //select(b, a, condition) == condition ? a : b\n"
-" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
-" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
-" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
-" \n"
-" b3Scalar t_min_final = 0.0f;\n"
-" b3Scalar t_max_final = rayLength;\n"
-" \n"
-" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
-" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
-" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
-" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
-" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
-" \n"
-" return (t_min_final <= t_max_final);\n"
-"}\n"
-"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
-" __global int* rootNodeIndex, \n"
-" __global int2* internalNodeChildIndices, \n"
-" __global b3AabbCL* internalNodeAabbs,\n"
-" __global int2* internalNodeLeafIndexRanges,\n"
-" __global SortDataCL* mortonCodesAndAabbIndices,\n"
-" \n"
-" __global b3RayInfo* rays,\n"
-" \n"
-" __global int* out_numRayRigidPairs, \n"
-" __global int2* out_rayRigidPairs,\n"
-" int maxRayRigidPairs, int numRays)\n"
-"{\n"
-" int rayIndex = get_global_id(0);\n"
-" if(rayIndex >= numRays) return;\n"
-" \n"
-" //\n"
-" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
-" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
-" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
-" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
-" \n"
-" //\n"
-" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
-" \n"
-" int stackSize = 1;\n"
-" stack[0] = *rootNodeIndex;\n"
-" \n"
-" while(stackSize)\n"
-" {\n"
-" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
-" --stackSize;\n"
-" \n"
-" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
-" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
-" \n"
-" //bvhRigidIndex is not used if internal node\n"
-" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
-" \n"
-" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
-" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n"
-" {\n"
-" if(isLeaf)\n"
-" {\n"
-" int2 rayRigidPair;\n"
-" rayRigidPair.x = rayIndex;\n"
-" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
-" \n"
-" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
-" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
-" }\n"
-" \n"
-" if(!isLeaf) //Internal node\n"
-" {\n"
-" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
-" {\n"
-" //Error\n"
-" }\n"
-" else\n"
-" {\n"
-" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
-" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
-" }\n"
-" }\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
-" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
-" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
-"{\n"
-" int smallAabbIndex = get_global_id(0);\n"
-" if(smallAabbIndex >= numSmallAabbRigids) return;\n"
-" \n"
-" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
-" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
-" {\n"
-" b3AabbCL largeAabb = largeAabbs[i];\n"
-" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
-" {\n"
-" int4 pair;\n"
-" pair.x = largeAabb.m_minIndices[3];\n"
-" pair.y = smallAabb.m_minIndices[3];\n"
-" pair.z = NEW_PAIR_MARKER;\n"
-" pair.w = NEW_PAIR_MARKER;\n"
-" \n"
-" int pairIndex = atomic_inc(out_numPairs);\n"
-" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
-" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n"
-" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
-"{\n"
-" int rayIndex = get_global_id(0);\n"
-" if(rayIndex >= numRays) return;\n"
-" \n"
-" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
-" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
-" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
-" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
-" \n"
-" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
-" {\n"
-" b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
-" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
-" {\n"
-" int2 rayRigidPair;\n"
-" rayRigidPair.x = rayIndex;\n"
-" rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
-" \n"
-" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
-" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
-" }\n"
-" }\n"
-"}\n"
-"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
-"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
-"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
-"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
-"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
-"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
-"#define b3Int64 long\n"
-"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
-"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
-"{\n"
-" //This function only needs to return (i & j) in order for the algorithm to work,\n"
-" //but it may help with debugging to mask out the lower bits.\n"
-" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
-" b3Int64 sharedBits = i & j;\n"
-" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n"
-" \n"
-" return sharedBits & bitmask;\n"
-"}\n"
-"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
-"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
-"{\n"
-" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
-"}\n"
-"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
-" __global b3Int64* out_commonPrefixes,\n"
-" __global int* out_commonPrefixLengths,\n"
-" int numInternalNodes)\n"
-"{\n"
-" int internalNodeIndex = get_global_id(0);\n"
-" if (internalNodeIndex >= numInternalNodes) return;\n"
-" \n"
-" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
-" //and the number of internal nodes is always numLeafNodes - 1\n"
-" int leftLeafIndex = internalNodeIndex;\n"
-" int rightLeafIndex = internalNodeIndex + 1;\n"
-" \n"
-" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
-" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
-" \n"
-" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
-" //Append the index of each leaf node to each morton code so that there are no duplicates.\n"
-" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
-" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
-" //\n"
-" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
-" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
-" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
-" \n"
-" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
-" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
-" __global int2* out_childNodes, int numLeafNodes)\n"
-"{\n"
-" int leafNodeIndex = get_global_id(0);\n"
-" if (leafNodeIndex >= numLeafNodes) return;\n"
-" \n"
-" int numInternalNodes = numLeafNodes - 1;\n"
-" \n"
-" int leftSplitIndex = leafNodeIndex - 1;\n"
-" int rightSplitIndex = leafNodeIndex;\n"
-" \n"
-" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-" \n"
-" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
-" //Leaf nodes are considered as having the highest common prefix\n"
-" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
-" \n"
-" //Handle cases for the edge nodes; the first and last node\n"
-" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
-" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
-" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
-" \n"
-" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
-" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
-" \n"
-" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n"
-" \n"
-" //out_childNodesAsInt[0] == int2.x == left child\n"
-" //out_childNodesAsInt[1] == int2.y == right child\n"
-" int isLeaf = 1;\n"
-" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
-" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
-" __global int2* out_childNodes,\n"
-" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
-" int numInternalNodes)\n"
-"{\n"
-" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
-" if(internalNodeIndex >= numInternalNodes) return;\n"
-" \n"
-" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
-" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
-" \n"
-"//#define USE_LINEAR_SEARCH\n"
-"#ifdef USE_LINEAR_SEARCH\n"
-" int leftIndex = -1;\n"
-" int rightIndex = -1;\n"
-" \n"
-" //Find nearest element to left with a lower common prefix\n"
-" for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
-" {\n"
-" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
-" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
-" {\n"
-" leftIndex = i;\n"
-" break;\n"
-" }\n"
-" }\n"
-" \n"
-" //Find nearest element to right with a lower common prefix\n"
-" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
-" {\n"
-" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
-" if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
-" {\n"
-" rightIndex = i;\n"
-" break;\n"
-" }\n"
-" }\n"
-" \n"
-"#else //Use binary search\n"
-" //Find nearest element to left with a lower common prefix\n"
-" int leftIndex = -1;\n"
-" {\n"
-" int lower = 0;\n"
-" int upper = internalNodeIndex - 1;\n"
-" \n"
-" while(lower <= upper)\n"
-" {\n"
-" int mid = (lower + upper) / 2;\n"
-" b3Int64 midPrefix = commonPrefixes[mid];\n"
-" int midPrefixLength = commonPrefixLengths[mid];\n"
-" \n"
-" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
-" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
-" {\n"
-" int right = mid + 1;\n"
-" if(right < internalNodeIndex)\n"
-" {\n"
-" b3Int64 rightPrefix = commonPrefixes[right];\n"
-" int rightPrefixLength = commonPrefixLengths[right];\n"
-" \n"
-" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
-" if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
-" {\n"
-" lower = right;\n"
-" leftIndex = right;\n"
-" }\n"
-" else \n"
-" {\n"
-" leftIndex = mid;\n"
-" break;\n"
-" }\n"
-" }\n"
-" else \n"
-" {\n"
-" leftIndex = mid;\n"
-" break;\n"
-" }\n"
-" }\n"
-" else upper = mid - 1;\n"
-" }\n"
-" }\n"
-" \n"
-" //Find nearest element to right with a lower common prefix\n"
-" int rightIndex = -1;\n"
-" {\n"
-" int lower = internalNodeIndex + 1;\n"
-" int upper = numInternalNodes - 1;\n"
-" \n"
-" while(lower <= upper)\n"
-" {\n"
-" int mid = (lower + upper) / 2;\n"
-" b3Int64 midPrefix = commonPrefixes[mid];\n"
-" int midPrefixLength = commonPrefixLengths[mid];\n"
-" \n"
-" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
-" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
-" {\n"
-" int left = mid - 1;\n"
-" if(left > internalNodeIndex)\n"
-" {\n"
-" b3Int64 leftPrefix = commonPrefixes[left];\n"
-" int leftPrefixLength = commonPrefixLengths[left];\n"
-" \n"
-" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
-" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
-" {\n"
-" upper = left;\n"
-" rightIndex = left;\n"
-" }\n"
-" else \n"
-" {\n"
-" rightIndex = mid;\n"
-" break;\n"
-" }\n"
-" }\n"
-" else \n"
-" {\n"
-" rightIndex = mid;\n"
-" break;\n"
-" }\n"
-" }\n"
-" else lower = mid + 1;\n"
-" }\n"
-" }\n"
-"#endif\n"
-" \n"
-" //Select parent\n"
-" {\n"
-" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
-" \n"
-" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
-" \n"
-" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
-" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
-" \n"
-" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
-" \n"
-" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
-" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
-" \n"
-" int isLeaf = 0;\n"
-" if(!isRootNode)\n"
-" {\n"
-" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n"
-" \n"
-" //out_childNodesAsInt[0] == int2.x == left child\n"
-" //out_childNodesAsInt[1] == int2.y == right child\n"
-" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
-" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
-" }\n"
-" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
-" }\n"
-"}\n"
-"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
-" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
-"{\n"
-" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
-" int internalNodeIndex = get_global_id(0);\n"
-" if(internalNodeIndex >= numInternalNodes) return;\n"
-" \n"
-" //\n"
-" int distanceFromRoot = 0;\n"
-" {\n"
-" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
-" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
-" {\n"
-" parentIndex = internalNodeParentNodes[parentIndex];\n"
-" ++distanceFromRoot;\n"
-" }\n"
-" }\n"
-" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
-" \n"
-" //\n"
-" __local int localMaxDistanceFromRoot;\n"
-" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
-"}\n"
-"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
-" __global int2* childNodes,\n"
-" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
-" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
-"{\n"
-" int internalNodeIndex = get_global_id(0);\n"
-" if(internalNodeIndex >= numInternalNodes) return;\n"
-" \n"
-" int distance = distanceFromRoot[internalNodeIndex];\n"
-" \n"
-" if(distance == processedDistance)\n"
-" {\n"
-" int leftChildIndex = childNodes[internalNodeIndex].x;\n"
-" int rightChildIndex = childNodes[internalNodeIndex].y;\n"
-" \n"
-" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
-" int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
-" \n"
-" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
-" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
-" \n"
-" //leftRigidIndex/rightRigidIndex is not used if internal node\n"
-" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
-" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
-" \n"
-" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
-" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
-" \n"
-" b3AabbCL mergedAabb;\n"
-" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
-" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
-" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
-" }\n"
-"}\n"
-"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
-"{\n"
-" int internalNodeIndex = get_global_id(0);\n"
-" if(internalNodeIndex >= numInternalNodes) return;\n"
-" \n"
-" int numLeafNodes = numInternalNodes + 1;\n"
-" \n"
-" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
-" \n"
-" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n"
-" \n"
-" //Find lowest leaf index covered by this internal node\n"
-" {\n"
-" int lowestIndex = childNodes.x; //childNodes.x == Left child\n"
-" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
-" leafIndexRange.x = lowestIndex;\n"
-" }\n"
-" \n"
-" //Find highest leaf index covered by this internal node\n"
-" {\n"
-" int highestIndex = childNodes.y; //childNodes.y == Right child\n"
-" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
-" leafIndexRange.y = highestIndex;\n"
-" }\n"
-" \n"
-" //\n"
-" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
-"}\n"
-;
+static const char* parallelLinearBvhCL =
+ "/*\n"
+ "This software is provided 'as-is', without any express or implied warranty.\n"
+ "In no event will the authors be held liable for any damages arising from the use of this software.\n"
+ "Permission is granted to anyone to use this software for any purpose,\n"
+ "including commercial applications, and to alter it and redistribute it freely,\n"
+ "subject to the following restrictions:\n"
+ "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+ "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+ "3. This notice may not be removed or altered from any source distribution.\n"
+ "*/\n"
+ "//Initial Author Jackson Lee, 2014\n"
+ "typedef float b3Scalar;\n"
+ "typedef float4 b3Vector3;\n"
+ "#define b3Max max\n"
+ "#define b3Min min\n"
+ "#define b3Sqrt sqrt\n"
+ "typedef struct\n"
+ "{\n"
+ " unsigned int m_key;\n"
+ " unsigned int m_value;\n"
+ "} SortDataCL;\n"
+ "typedef struct \n"
+ "{\n"
+ " union\n"
+ " {\n"
+ " float4 m_min;\n"
+ " float m_minElems[4];\n"
+ " int m_minIndices[4];\n"
+ " };\n"
+ " union\n"
+ " {\n"
+ " float4 m_max;\n"
+ " float m_maxElems[4];\n"
+ " int m_maxIndices[4];\n"
+ " };\n"
+ "} b3AabbCL;\n"
+ "unsigned int interleaveBits(unsigned int x)\n"
+ "{\n"
+ " //........ ........ ......12 3456789A //x\n"
+ " //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n"
+ " \n"
+ " //......12 3456789A ......12 3456789A //x ^ (x << 16)\n"
+ " //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n"
+ " //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+ " \n"
+ " //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n"
+ " //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n"
+ " //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n"
+ " \n"
+ " //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n"
+ " //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n"
+ " //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n"
+ " \n"
+ " //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n"
+ " //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n"
+ " //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n"
+ " \n"
+ " //........ ........ ......11 11111111 //0x000003FF\n"
+ " x &= 0x000003FF; //Clear all bits above bit 10\n"
+ " \n"
+ " x = (x ^ (x << 16)) & 0xFF0000FF;\n"
+ " x = (x ^ (x << 8)) & 0x0300F00F;\n"
+ " x = (x ^ (x << 4)) & 0x030C30C3;\n"
+ " x = (x ^ (x << 2)) & 0x09249249;\n"
+ " \n"
+ " return x;\n"
+ "}\n"
+ "unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
+ "{\n"
+ " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
+ "}\n"
+ "__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
+ "{\n"
+ " int separatedAabbIndex = get_global_id(0);\n"
+ " if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
+ " int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
+ " out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
+ "}\n"
+ "//Should replace with an optimized parallel reduction\n"
+ "__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
+ "{\n"
+ " //Each time this kernel is added to the command queue, \n"
+ " //the number of AABBs needing to be merged is halved\n"
+ " //\n"
+ " //Example with 159 AABBs:\n"
+ " // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
+ " // numMergedAabbs == 159 - 80 == 79\n"
+ " //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
+ " \n"
+ " int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
+ " int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
+ " \n"
+ " int aabbIndex = get_global_id(0);\n"
+ " if(aabbIndex >= numMergedAabbs) return;\n"
+ " \n"
+ " int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
+ " \n"
+ " b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+ " b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
+ " \n"
+ " b3AabbCL mergedAabb;\n"
+ " mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+ " mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+ " out_mergedAabb[aabbIndex] = mergedAabb;\n"
+ "}\n"
+ "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
+ " __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
+ "{\n"
+ " int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n"
+ " if(leafNodeIndex >= numAabbs) return;\n"
+ " \n"
+ " b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
+ " b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
+ " b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
+ " \n"
+ " b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+ " b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+ " b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
+ " \n"
+ " //Quantize into integer coordinates\n"
+ " //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
+ " b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
+ " \n"
+ " int4 discretePosition;\n"
+ " discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
+ " discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
+ " discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
+ " \n"
+ " //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
+ " discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
+ " discretePosition += 512;\n"
+ " \n"
+ " //Interleave bits(assign a morton code, also known as a z-curve)\n"
+ " unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
+ " \n"
+ " //\n"
+ " SortDataCL mortonCodeIndexPair;\n"
+ " mortonCodeIndexPair.m_key = mortonCode;\n"
+ " mortonCodeIndexPair.m_value = leafNodeIndex;\n"
+ " \n"
+ " out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
+ "}\n"
+ "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
+ "//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
+ "//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
+ "//In both cases, the bit should be cleared to access the actual node index.\n"
+ "int isLeafNode(int index) { return (index >> 31 == 0); }\n"
+ "int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
+ "int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
+ "//From sap.cl\n"
+ "#define NEW_PAIR_MARKER -1\n"
+ "bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
+ "{\n"
+ " bool overlap = true;\n"
+ " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+ " return overlap;\n"
+ "}\n"
+ "//From sap.cl\n"
+ "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
+ " __global int* rootNodeIndex, \n"
+ " __global int2* internalNodeChildIndices, \n"
+ " __global b3AabbCL* internalNodeAabbs,\n"
+ " __global int2* internalNodeLeafIndexRanges,\n"
+ " \n"
+ " __global SortDataCL* mortonCodesAndAabbIndices,\n"
+ " __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+ " int maxPairs, int numQueryAabbs)\n"
+ "{\n"
+ " //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
+ " //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
+ " int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+ " if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
+ " \n"
+ " int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
+ " b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
+ " \n"
+ " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
+ " \n"
+ " int stackSize = 1;\n"
+ " stack[0] = *rootNodeIndex;\n"
+ " \n"
+ " while(stackSize)\n"
+ " {\n"
+ " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+ " --stackSize;\n"
+ " \n"
+ " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
+ " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+ " \n"
+ " //Optimization - if the BVH is structured as a binary radix tree, then\n"
+ " //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
+ " //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
+ " {\n"
+ " int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
+ " if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
+ " }\n"
+ " \n"
+ " //bvhRigidIndex is not used if internal node\n"
+ " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+ " \n"
+ " b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+ " if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
+ " {\n"
+ " if(isLeaf)\n"
+ " {\n"
+ " int4 pair;\n"
+ " pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
+ " pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+ " pair.z = NEW_PAIR_MARKER;\n"
+ " pair.w = NEW_PAIR_MARKER;\n"
+ " \n"
+ " int pairIndex = atomic_inc(out_numPairs);\n"
+ " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+ " }\n"
+ " \n"
+ " if(!isLeaf) //Internal node\n"
+ " {\n"
+ " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+ " {\n"
+ " //Error\n"
+ " }\n"
+ " else\n"
+ " {\n"
+ " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+ " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " }\n"
+ "}\n"
+ "//From rayCastKernels.cl\n"
+ "typedef struct\n"
+ "{\n"
+ " float4 m_from;\n"
+ " float4 m_to;\n"
+ "} b3RayInfo;\n"
+ "//From rayCastKernels.cl\n"
+ "b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
+ "{\n"
+ " b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
+ " return normalize(normal); //OpenCL normalize == vector4 normalize\n"
+ "}\n"
+ "b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
+ "b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
+ "int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
+ "{\n"
+ " //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
+ " //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
+ " //\n"
+ " //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
+ " //and min.x will be the far plane; otherwise, it is reversed.\n"
+ " //\n"
+ " //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
+ " //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
+ " \n"
+ " int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n"
+ " \n"
+ " //When using vector types, the select() function checks the most signficant bit, \n"
+ " //but isless() sets the least significant bit.\n"
+ " isNegative <<= 31;\n"
+ " //select(b, a, condition) == condition ? a : b\n"
+ " //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
+ " b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+ " b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
+ " \n"
+ " b3Scalar t_min_final = 0.0f;\n"
+ " b3Scalar t_max_final = rayLength;\n"
+ " \n"
+ " //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
+ " //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
+ " //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
+ " t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
+ " t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
+ " \n"
+ " return (t_min_final <= t_max_final);\n"
+ "}\n"
+ "__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
+ " __global int* rootNodeIndex, \n"
+ " __global int2* internalNodeChildIndices, \n"
+ " __global b3AabbCL* internalNodeAabbs,\n"
+ " __global int2* internalNodeLeafIndexRanges,\n"
+ " __global SortDataCL* mortonCodesAndAabbIndices,\n"
+ " \n"
+ " __global b3RayInfo* rays,\n"
+ " \n"
+ " __global int* out_numRayRigidPairs, \n"
+ " __global int2* out_rayRigidPairs,\n"
+ " int maxRayRigidPairs, int numRays)\n"
+ "{\n"
+ " int rayIndex = get_global_id(0);\n"
+ " if(rayIndex >= numRays) return;\n"
+ " \n"
+ " //\n"
+ " b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+ " b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+ " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+ " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+ " \n"
+ " //\n"
+ " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
+ " \n"
+ " int stackSize = 1;\n"
+ " stack[0] = *rootNodeIndex;\n"
+ " \n"
+ " while(stackSize)\n"
+ " {\n"
+ " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+ " --stackSize;\n"
+ " \n"
+ " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
+ " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+ " \n"
+ " //bvhRigidIndex is not used if internal node\n"
+ " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+ " \n"
+ " b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+ " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n"
+ " {\n"
+ " if(isLeaf)\n"
+ " {\n"
+ " int2 rayRigidPair;\n"
+ " rayRigidPair.x = rayIndex;\n"
+ " rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+ " \n"
+ " int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+ " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+ " }\n"
+ " \n"
+ " if(!isLeaf) //Internal node\n"
+ " {\n"
+ " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+ " {\n"
+ " //Error\n"
+ " }\n"
+ " else\n"
+ " {\n"
+ " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+ " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
+ " __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+ " int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
+ "{\n"
+ " int smallAabbIndex = get_global_id(0);\n"
+ " if(smallAabbIndex >= numSmallAabbRigids) return;\n"
+ " \n"
+ " b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
+ " for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+ " {\n"
+ " b3AabbCL largeAabb = largeAabbs[i];\n"
+ " if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
+ " {\n"
+ " int4 pair;\n"
+ " pair.x = largeAabb.m_minIndices[3];\n"
+ " pair.y = smallAabb.m_minIndices[3];\n"
+ " pair.z = NEW_PAIR_MARKER;\n"
+ " pair.w = NEW_PAIR_MARKER;\n"
+ " \n"
+ " int pairIndex = atomic_inc(out_numPairs);\n"
+ " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
+ " __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n"
+ " int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
+ "{\n"
+ " int rayIndex = get_global_id(0);\n"
+ " if(rayIndex >= numRays) return;\n"
+ " \n"
+ " b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
+ " b3Vector3 rayTo = rays[rayIndex].m_to;\n"
+ " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
+ " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
+ " \n"
+ " for(int i = 0; i < numLargeAabbRigids; ++i)\n"
+ " {\n"
+ " b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
+ " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
+ " {\n"
+ " int2 rayRigidPair;\n"
+ " rayRigidPair.x = rayIndex;\n"
+ " rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
+ " \n"
+ " int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
+ " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
+ "//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
+ "//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
+ "//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
+ "#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
+ "#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
+ "#define b3Int64 long\n"
+ "int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
+ "b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
+ "{\n"
+ " //This function only needs to return (i & j) in order for the algorithm to work,\n"
+ " //but it may help with debugging to mask out the lower bits.\n"
+ " b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
+ " b3Int64 sharedBits = i & j;\n"
+ " b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n"
+ " \n"
+ " return sharedBits & bitmask;\n"
+ "}\n"
+ "//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
+ "int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
+ "{\n"
+ " return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
+ "}\n"
+ "__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
+ " __global b3Int64* out_commonPrefixes,\n"
+ " __global int* out_commonPrefixLengths,\n"
+ " int numInternalNodes)\n"
+ "{\n"
+ " int internalNodeIndex = get_global_id(0);\n"
+ " if (internalNodeIndex >= numInternalNodes) return;\n"
+ " \n"
+ " //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
+ " //and the number of internal nodes is always numLeafNodes - 1\n"
+ " int leftLeafIndex = internalNodeIndex;\n"
+ " int rightLeafIndex = internalNodeIndex + 1;\n"
+ " \n"
+ " int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
+ " int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
+ " \n"
+ " //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
+ " //Append the index of each leaf node to each morton code so that there are no duplicates.\n"
+ " //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
+ " //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
+ " //\n"
+ " //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
+ " b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
+ " b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
+ " \n"
+ " out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+ " out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
+ "}\n"
+ "__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
+ " __global int2* out_childNodes, int numLeafNodes)\n"
+ "{\n"
+ " int leafNodeIndex = get_global_id(0);\n"
+ " if (leafNodeIndex >= numLeafNodes) return;\n"
+ " \n"
+ " int numInternalNodes = numLeafNodes - 1;\n"
+ " \n"
+ " int leftSplitIndex = leafNodeIndex - 1;\n"
+ " int rightSplitIndex = leafNodeIndex;\n"
+ " \n"
+ " int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+ " int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+ " \n"
+ " //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
+ " //Leaf nodes are considered as having the highest common prefix\n"
+ " int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
+ " \n"
+ " //Handle cases for the edge nodes; the first and last node\n"
+ " //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
+ " if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
+ " if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
+ " \n"
+ " int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
+ " out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
+ " \n"
+ " int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n"
+ " \n"
+ " //out_childNodesAsInt[0] == int2.x == left child\n"
+ " //out_childNodesAsInt[1] == int2.y == right child\n"
+ " int isLeaf = 1;\n"
+ " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+ " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
+ "}\n"
+ "__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
+ " __global int2* out_childNodes,\n"
+ " __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
+ " int numInternalNodes)\n"
+ "{\n"
+ " int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+ " if(internalNodeIndex >= numInternalNodes) return;\n"
+ " \n"
+ " b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
+ " int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
+ " \n"
+ "//#define USE_LINEAR_SEARCH\n"
+ "#ifdef USE_LINEAR_SEARCH\n"
+ " int leftIndex = -1;\n"
+ " int rightIndex = -1;\n"
+ " \n"
+ " //Find nearest element to left with a lower common prefix\n"
+ " for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
+ " {\n"
+ " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+ " if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
+ " {\n"
+ " leftIndex = i;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " //Find nearest element to right with a lower common prefix\n"
+ " for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
+ " {\n"
+ " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
+ " if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
+ " {\n"
+ " rightIndex = i;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " \n"
+ "#else //Use binary search\n"
+ " //Find nearest element to left with a lower common prefix\n"
+ " int leftIndex = -1;\n"
+ " {\n"
+ " int lower = 0;\n"
+ " int upper = internalNodeIndex - 1;\n"
+ " \n"
+ " while(lower <= upper)\n"
+ " {\n"
+ " int mid = (lower + upper) / 2;\n"
+ " b3Int64 midPrefix = commonPrefixes[mid];\n"
+ " int midPrefixLength = commonPrefixLengths[mid];\n"
+ " \n"
+ " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+ " if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+ " {\n"
+ " int right = mid + 1;\n"
+ " if(right < internalNodeIndex)\n"
+ " {\n"
+ " b3Int64 rightPrefix = commonPrefixes[right];\n"
+ " int rightPrefixLength = commonPrefixLengths[right];\n"
+ " \n"
+ " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
+ " if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
+ " {\n"
+ " lower = right;\n"
+ " leftIndex = right;\n"
+ " }\n"
+ " else \n"
+ " {\n"
+ " leftIndex = mid;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " else \n"
+ " {\n"
+ " leftIndex = mid;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " else upper = mid - 1;\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " //Find nearest element to right with a lower common prefix\n"
+ " int rightIndex = -1;\n"
+ " {\n"
+ " int lower = internalNodeIndex + 1;\n"
+ " int upper = numInternalNodes - 1;\n"
+ " \n"
+ " while(lower <= upper)\n"
+ " {\n"
+ " int mid = (lower + upper) / 2;\n"
+ " b3Int64 midPrefix = commonPrefixes[mid];\n"
+ " int midPrefixLength = commonPrefixLengths[mid];\n"
+ " \n"
+ " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
+ " if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
+ " {\n"
+ " int left = mid - 1;\n"
+ " if(left > internalNodeIndex)\n"
+ " {\n"
+ " b3Int64 leftPrefix = commonPrefixes[left];\n"
+ " int leftPrefixLength = commonPrefixLengths[left];\n"
+ " \n"
+ " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
+ " if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
+ " {\n"
+ " upper = left;\n"
+ " rightIndex = left;\n"
+ " }\n"
+ " else \n"
+ " {\n"
+ " rightIndex = mid;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " else \n"
+ " {\n"
+ " rightIndex = mid;\n"
+ " break;\n"
+ " }\n"
+ " }\n"
+ " else lower = mid + 1;\n"
+ " }\n"
+ " }\n"
+ "#endif\n"
+ " \n"
+ " //Select parent\n"
+ " {\n"
+ " int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+ " int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
+ " \n"
+ " int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
+ " \n"
+ " if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
+ " else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
+ " \n"
+ " int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
+ " \n"
+ " int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
+ " out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
+ " \n"
+ " int isLeaf = 0;\n"
+ " if(!isRootNode)\n"
+ " {\n"
+ " int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n"
+ " \n"
+ " //out_childNodesAsInt[0] == int2.x == left child\n"
+ " //out_childNodesAsInt[1] == int2.y == right child\n"
+ " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
+ " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+ " }\n"
+ " else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
+ " }\n"
+ "}\n"
+ "__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
+ " __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
+ "{\n"
+ " if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
+ " int internalNodeIndex = get_global_id(0);\n"
+ " if(internalNodeIndex >= numInternalNodes) return;\n"
+ " \n"
+ " //\n"
+ " int distanceFromRoot = 0;\n"
+ " {\n"
+ " int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
+ " while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
+ " {\n"
+ " parentIndex = internalNodeParentNodes[parentIndex];\n"
+ " ++distanceFromRoot;\n"
+ " }\n"
+ " }\n"
+ " out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
+ " \n"
+ " //\n"
+ " __local int localMaxDistanceFromRoot;\n"
+ " if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
+ "}\n"
+ "__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
+ " __global int2* childNodes,\n"
+ " __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
+ " int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
+ "{\n"
+ " int internalNodeIndex = get_global_id(0);\n"
+ " if(internalNodeIndex >= numInternalNodes) return;\n"
+ " \n"
+ " int distance = distanceFromRoot[internalNodeIndex];\n"
+ " \n"
+ " if(distance == processedDistance)\n"
+ " {\n"
+ " int leftChildIndex = childNodes[internalNodeIndex].x;\n"
+ " int rightChildIndex = childNodes[internalNodeIndex].y;\n"
+ " \n"
+ " int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
+ " int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
+ " \n"
+ " leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
+ " rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
+ " \n"
+ " //leftRigidIndex/rightRigidIndex is not used if internal node\n"
+ " int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
+ " int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
+ " \n"
+ " b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
+ " b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
+ " \n"
+ " b3AabbCL mergedAabb;\n"
+ " mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
+ " mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
+ " internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
+ " }\n"
+ "}\n"
+ "__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
+ "{\n"
+ " int internalNodeIndex = get_global_id(0);\n"
+ " if(internalNodeIndex >= numInternalNodes) return;\n"
+ " \n"
+ " int numLeafNodes = numInternalNodes + 1;\n"
+ " \n"
+ " int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
+ " \n"
+ " int2 leafIndexRange; //x == min leaf index, y == max leaf index\n"
+ " \n"
+ " //Find lowest leaf index covered by this internal node\n"
+ " {\n"
+ " int lowestIndex = childNodes.x; //childNodes.x == Left child\n"
+ " while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
+ " leafIndexRange.x = lowestIndex;\n"
+ " }\n"
+ " \n"
+ " //Find highest leaf index covered by this internal node\n"
+ " {\n"
+ " int highestIndex = childNodes.y; //childNodes.y == Right child\n"
+ " while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
+ " leafIndexRange.y = highestIndex;\n"
+ " }\n"
+ " \n"
+ " //\n"
+ " out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
+ "}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
index 04d40fcf26..d6999b94cb 100644
--- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -1,342 +1,341 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* sapCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Erwin Coumans\n"
-"#define NEW_PAIR_MARKER -1\n"
-"typedef struct \n"
-"{\n"
-" union\n"
-" {\n"
-" float4 m_min;\n"
-" float m_minElems[4];\n"
-" int m_minIndices[4];\n"
-" };\n"
-" union\n"
-" {\n"
-" float4 m_max;\n"
-" float m_maxElems[4];\n"
-" int m_maxIndices[4];\n"
-" };\n"
-"} btAabbCL;\n"
-"/// conservative test for overlap between two aabbs\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
-"{\n"
-" bool overlap = true;\n"
-" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-" return overlap;\n"
-"}\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
-"{\n"
-" bool overlap = true;\n"
-" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-" return overlap;\n"
-"}\n"
-"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
-"{\n"
-" bool overlap = true;\n"
-" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-" return overlap;\n"
-"}\n"
-"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numUnsortedAabbs)\n"
-" return;\n"
-" int j = get_global_id(1);\n"
-" if (j>=numUnSortedAabbs2)\n"
-" return;\n"
-" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
-" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
-" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
-" {\n"
-" int4 myPair;\n"
-" \n"
-" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
-" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
-" if (xIndex>yIndex)\n"
-" {\n"
-" int tmp = xIndex;\n"
-" xIndex=yIndex;\n"
-" yIndex=tmp;\n"
-" }\n"
-" \n"
-" myPair.x = xIndex;\n"
-" myPair.y = yIndex;\n"
-" myPair.z = NEW_PAIR_MARKER;\n"
-" myPair.w = NEW_PAIR_MARKER;\n"
-" int curPair = atomic_inc (pairCount);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" pairsOut[curPair] = myPair; //flush to main memory\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numObjects)\n"
-" return;\n"
-" for (int j=i+1;j<numObjects;j++)\n"
-" {\n"
-" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-" {\n"
-" int4 myPair;\n"
-" myPair.x = aabbs[i].m_minIndices[3];\n"
-" myPair.y = aabbs[j].m_minIndices[3];\n"
-" myPair.z = NEW_PAIR_MARKER;\n"
-" myPair.w = NEW_PAIR_MARKER;\n"
-" int curPair = atomic_inc (pairCount);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" pairsOut[curPair] = myPair; //flush to main memory\n"
-" }\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numObjects)\n"
-" return;\n"
-" for (int j=i+1;j<numObjects;j++)\n"
-" {\n"
-" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
-" {\n"
-" break;\n"
-" }\n"
-" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-" {\n"
-" int4 myPair;\n"
-" myPair.x = aabbs[i].m_minIndices[3];\n"
-" myPair.y = aabbs[j].m_minIndices[3];\n"
-" myPair.z = NEW_PAIR_MARKER;\n"
-" myPair.w = NEW_PAIR_MARKER;\n"
-" int curPair = atomic_inc (pairCount);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" pairsOut[curPair] = myPair; //flush to main memory\n"
-" }\n"
-" }\n"
-" }\n"
-"}\n"
-"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" int localId = get_local_id(0);\n"
-" __local int numActiveWgItems[1];\n"
-" __local int breakRequest[1];\n"
-" if (localId==0)\n"
-" {\n"
-" numActiveWgItems[0] = 0;\n"
-" breakRequest[0] = 0;\n"
-" }\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" atomic_inc(numActiveWgItems);\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" int localBreak = 0;\n"
-" int j=i+1;\n"
-" do\n"
-" {\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (j<numObjects)\n"
-" {\n"
-" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
-" {\n"
-" if (!localBreak)\n"
-" {\n"
-" atomic_inc(breakRequest);\n"
-" localBreak = 1;\n"
-" }\n"
-" }\n"
-" }\n"
-" \n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (j>=numObjects && !localBreak)\n"
-" {\n"
-" atomic_inc(breakRequest);\n"
-" localBreak = 1;\n"
-" }\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (!localBreak)\n"
-" {\n"
-" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
-" {\n"
-" int4 myPair;\n"
-" myPair.x = aabbs[i].m_minIndices[3];\n"
-" myPair.y = aabbs[j].m_minIndices[3];\n"
-" myPair.z = NEW_PAIR_MARKER;\n"
-" myPair.w = NEW_PAIR_MARKER;\n"
-" int curPair = atomic_inc (pairCount);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" pairsOut[curPair] = myPair; //flush to main memory\n"
-" }\n"
-" }\n"
-" }\n"
-" j++;\n"
-" } while (breakRequest[0]<numActiveWgItems[0]);\n"
-"}\n"
-"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" int localId = get_local_id(0);\n"
-" __local int numActiveWgItems[1];\n"
-" __local int breakRequest[1];\n"
-" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
-" \n"
-" btAabbCL myAabb;\n"
-" \n"
-" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
-" float testValue = myAabb.m_maxElems[axis];\n"
-" \n"
-" if (localId==0)\n"
-" {\n"
-" numActiveWgItems[0] = 0;\n"
-" breakRequest[0] = 0;\n"
-" }\n"
-" int localCount=0;\n"
-" int block=0;\n"
-" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
-" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
-" \n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" atomic_inc(numActiveWgItems);\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" int localBreak = 0;\n"
-" \n"
-" int j=i+1;\n"
-" do\n"
-" {\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (j<numObjects)\n"
-" {\n"
-" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
-" {\n"
-" if (!localBreak)\n"
-" {\n"
-" atomic_inc(breakRequest);\n"
-" localBreak = 1;\n"
-" }\n"
-" }\n"
-" }\n"
-" \n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (j>=numObjects && !localBreak)\n"
-" {\n"
-" atomic_inc(breakRequest);\n"
-" localBreak = 1;\n"
-" }\n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" \n"
-" if (!localBreak)\n"
-" {\n"
-" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
-" {\n"
-" int4 myPair;\n"
-" myPair.x = myAabb.m_minIndices[3];\n"
-" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
-" myPair.z = NEW_PAIR_MARKER;\n"
-" myPair.w = NEW_PAIR_MARKER;\n"
-" int curPair = atomic_inc (pairCount);\n"
-" if (curPair<maxPairs)\n"
-" {\n"
-" pairsOut[curPair] = myPair; //flush to main memory\n"
-" }\n"
-" }\n"
-" }\n"
-" \n"
-" barrier(CLK_LOCAL_MEM_FENCE);\n"
-" localCount++;\n"
-" if (localCount==64)\n"
-" {\n"
-" localCount = 0;\n"
-" block+=64; \n"
-" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
-" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
-" }\n"
-" j++;\n"
-" \n"
-" } while (breakRequest[0]<numActiveWgItems[0]);\n"
-" \n"
-"}\n"
-"//http://stereopsis.com/radix.html\n"
-"unsigned int FloatFlip(float fl);\n"
-"unsigned int FloatFlip(float fl)\n"
-"{\n"
-" unsigned int f = *(unsigned int*)&fl;\n"
-" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
-" return f ^ mask;\n"
-"}\n"
-"float IFloatFlip(unsigned int f);\n"
-"float IFloatFlip(unsigned int f)\n"
-"{\n"
-" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
-" unsigned int fl = f ^ mask;\n"
-" return *(float*)&fl;\n"
-"}\n"
-"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numObjects)\n"
-" return;\n"
-" int src = destAabbs[i].m_maxIndices[3];\n"
-" destAabbs[i] = allAabbs[src];\n"
-" destAabbs[i].m_maxIndices[3] = src;\n"
-"}\n"
-"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numObjects)\n"
-" return;\n"
-" \n"
-" \n"
-" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
-" sortData[i].y = i;\n"
-" \n"
-"}\n"
-"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numObjects)\n"
-" return;\n"
-" \n"
-" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
-"}\n"
-"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
-"{\n"
-" int i = get_global_id(0);\n"
-" if (i>=numAabbs)\n"
-" return;\n"
-" \n"
-" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
-" \n"
-" float4 s;\n"
-" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
-" sum[i]=s;\n"
-" sum2[i]=s*s; \n"
-"}\n"
-;
+static const char* sapCL =
+ "/*\n"
+ "Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
+ "This software is provided 'as-is', without any express or implied warranty.\n"
+ "In no event will the authors be held liable for any damages arising from the use of this software.\n"
+ "Permission is granted to anyone to use this software for any purpose, \n"
+ "including commercial applications, and to alter it and redistribute it freely, \n"
+ "subject to the following restrictions:\n"
+ "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+ "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+ "3. This notice may not be removed or altered from any source distribution.\n"
+ "*/\n"
+ "//Originally written by Erwin Coumans\n"
+ "#define NEW_PAIR_MARKER -1\n"
+ "typedef struct \n"
+ "{\n"
+ " union\n"
+ " {\n"
+ " float4 m_min;\n"
+ " float m_minElems[4];\n"
+ " int m_minIndices[4];\n"
+ " };\n"
+ " union\n"
+ " {\n"
+ " float4 m_max;\n"
+ " float m_maxElems[4];\n"
+ " int m_maxIndices[4];\n"
+ " };\n"
+ "} btAabbCL;\n"
+ "/// conservative test for overlap between two aabbs\n"
+ "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+ "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
+ "{\n"
+ " bool overlap = true;\n"
+ " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+ " return overlap;\n"
+ "}\n"
+ "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+ "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+ "{\n"
+ " bool overlap = true;\n"
+ " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+ " return overlap;\n"
+ "}\n"
+ "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+ "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+ "{\n"
+ " bool overlap = true;\n"
+ " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+ " return overlap;\n"
+ "}\n"
+ "__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numUnsortedAabbs)\n"
+ " return;\n"
+ " int j = get_global_id(1);\n"
+ " if (j>=numUnSortedAabbs2)\n"
+ " return;\n"
+ " __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
+ " __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
+ " if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
+ " {\n"
+ " int4 myPair;\n"
+ " \n"
+ " int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
+ " int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
+ " if (xIndex>yIndex)\n"
+ " {\n"
+ " int tmp = xIndex;\n"
+ " xIndex=yIndex;\n"
+ " yIndex=tmp;\n"
+ " }\n"
+ " \n"
+ " myPair.x = xIndex;\n"
+ " myPair.y = yIndex;\n"
+ " myPair.z = NEW_PAIR_MARKER;\n"
+ " myPair.w = NEW_PAIR_MARKER;\n"
+ " int curPair = atomic_inc (pairCount);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " pairsOut[curPair] = myPair; //flush to main memory\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numObjects)\n"
+ " return;\n"
+ " for (int j=i+1;j<numObjects;j++)\n"
+ " {\n"
+ " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+ " {\n"
+ " int4 myPair;\n"
+ " myPair.x = aabbs[i].m_minIndices[3];\n"
+ " myPair.y = aabbs[j].m_minIndices[3];\n"
+ " myPair.z = NEW_PAIR_MARKER;\n"
+ " myPair.w = NEW_PAIR_MARKER;\n"
+ " int curPair = atomic_inc (pairCount);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " pairsOut[curPair] = myPair; //flush to main memory\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numObjects)\n"
+ " return;\n"
+ " for (int j=i+1;j<numObjects;j++)\n"
+ " {\n"
+ " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+ " {\n"
+ " break;\n"
+ " }\n"
+ " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+ " {\n"
+ " int4 myPair;\n"
+ " myPair.x = aabbs[i].m_minIndices[3];\n"
+ " myPair.y = aabbs[j].m_minIndices[3];\n"
+ " myPair.z = NEW_PAIR_MARKER;\n"
+ " myPair.w = NEW_PAIR_MARKER;\n"
+ " int curPair = atomic_inc (pairCount);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " pairsOut[curPair] = myPair; //flush to main memory\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ "}\n"
+ "__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " int localId = get_local_id(0);\n"
+ " __local int numActiveWgItems[1];\n"
+ " __local int breakRequest[1];\n"
+ " if (localId==0)\n"
+ " {\n"
+ " numActiveWgItems[0] = 0;\n"
+ " breakRequest[0] = 0;\n"
+ " }\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " atomic_inc(numActiveWgItems);\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " int localBreak = 0;\n"
+ " int j=i+1;\n"
+ " do\n"
+ " {\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (j<numObjects)\n"
+ " {\n"
+ " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+ " {\n"
+ " if (!localBreak)\n"
+ " {\n"
+ " atomic_inc(breakRequest);\n"
+ " localBreak = 1;\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (j>=numObjects && !localBreak)\n"
+ " {\n"
+ " atomic_inc(breakRequest);\n"
+ " localBreak = 1;\n"
+ " }\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (!localBreak)\n"
+ " {\n"
+ " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+ " {\n"
+ " int4 myPair;\n"
+ " myPair.x = aabbs[i].m_minIndices[3];\n"
+ " myPair.y = aabbs[j].m_minIndices[3];\n"
+ " myPair.z = NEW_PAIR_MARKER;\n"
+ " myPair.w = NEW_PAIR_MARKER;\n"
+ " int curPair = atomic_inc (pairCount);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " pairsOut[curPair] = myPair; //flush to main memory\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " j++;\n"
+ " } while (breakRequest[0]<numActiveWgItems[0]);\n"
+ "}\n"
+ "__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " int localId = get_local_id(0);\n"
+ " __local int numActiveWgItems[1];\n"
+ " __local int breakRequest[1];\n"
+ " __local btAabbCL localAabbs[128];// = aabbs[i];\n"
+ " \n"
+ " btAabbCL myAabb;\n"
+ " \n"
+ " myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+ " float testValue = myAabb.m_maxElems[axis];\n"
+ " \n"
+ " if (localId==0)\n"
+ " {\n"
+ " numActiveWgItems[0] = 0;\n"
+ " breakRequest[0] = 0;\n"
+ " }\n"
+ " int localCount=0;\n"
+ " int block=0;\n"
+ " localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+ " localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+ " \n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " atomic_inc(numActiveWgItems);\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " int localBreak = 0;\n"
+ " \n"
+ " int j=i+1;\n"
+ " do\n"
+ " {\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (j<numObjects)\n"
+ " {\n"
+ " if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+ " {\n"
+ " if (!localBreak)\n"
+ " {\n"
+ " atomic_inc(breakRequest);\n"
+ " localBreak = 1;\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (j>=numObjects && !localBreak)\n"
+ " {\n"
+ " atomic_inc(breakRequest);\n"
+ " localBreak = 1;\n"
+ " }\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " \n"
+ " if (!localBreak)\n"
+ " {\n"
+ " if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+ " {\n"
+ " int4 myPair;\n"
+ " myPair.x = myAabb.m_minIndices[3];\n"
+ " myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+ " myPair.z = NEW_PAIR_MARKER;\n"
+ " myPair.w = NEW_PAIR_MARKER;\n"
+ " int curPair = atomic_inc (pairCount);\n"
+ " if (curPair<maxPairs)\n"
+ " {\n"
+ " pairsOut[curPair] = myPair; //flush to main memory\n"
+ " }\n"
+ " }\n"
+ " }\n"
+ " \n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ " localCount++;\n"
+ " if (localCount==64)\n"
+ " {\n"
+ " localCount = 0;\n"
+ " block+=64; \n"
+ " localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+ " localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+ " }\n"
+ " j++;\n"
+ " \n"
+ " } while (breakRequest[0]<numActiveWgItems[0]);\n"
+ " \n"
+ "}\n"
+ "//http://stereopsis.com/radix.html\n"
+ "unsigned int FloatFlip(float fl);\n"
+ "unsigned int FloatFlip(float fl)\n"
+ "{\n"
+ " unsigned int f = *(unsigned int*)&fl;\n"
+ " unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+ " return f ^ mask;\n"
+ "}\n"
+ "float IFloatFlip(unsigned int f);\n"
+ "float IFloatFlip(unsigned int f)\n"
+ "{\n"
+ " unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+ " unsigned int fl = f ^ mask;\n"
+ " return *(float*)&fl;\n"
+ "}\n"
+ "__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numObjects)\n"
+ " return;\n"
+ " int src = destAabbs[i].m_maxIndices[3];\n"
+ " destAabbs[i] = allAabbs[src];\n"
+ " destAabbs[i].m_maxIndices[3] = src;\n"
+ "}\n"
+ "__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numObjects)\n"
+ " return;\n"
+ " \n"
+ " \n"
+ " sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
+ " sortData[i].y = i;\n"
+ " \n"
+ "}\n"
+ "__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numObjects)\n"
+ " return;\n"
+ " \n"
+ " sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
+ "}\n"
+ "__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
+ "{\n"
+ " int i = get_global_id(0);\n"
+ " if (i>=numAabbs)\n"
+ " return;\n"
+ " \n"
+ " btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
+ " \n"
+ " float4 s;\n"
+ " s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
+ " sum[i]=s;\n"
+ " sum2[i]=s*s; \n"
+ "}\n";