diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp')
-rw-r--r-- | thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp | 918 |
1 files changed, 425 insertions, 493 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index c45fbbdcaa..4126d03ed0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -6,7 +6,6 @@ bool searchIncremental3dSapOnGpu = true; #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "kernels/sapKernels.h" @@ -56,110 +55,105 @@ bool searchIncremental3dSapOnGpu = true; class b3PrefixScanFloat4CL* m_prefixScanFloat4; */ -b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType) -:m_context(ctx), -m_device(device), -m_queue(q), - -m_objectMinMaxIndexGPUaxis0(ctx,q), -m_objectMinMaxIndexGPUaxis1(ctx,q), -m_objectMinMaxIndexGPUaxis2(ctx,q), -m_objectMinMaxIndexGPUaxis0prev(ctx,q), -m_objectMinMaxIndexGPUaxis1prev(ctx,q), -m_objectMinMaxIndexGPUaxis2prev(ctx,q), -m_sortedAxisGPU0(ctx,q), -m_sortedAxisGPU1(ctx,q), -m_sortedAxisGPU2(ctx,q), -m_sortedAxisGPU0prev(ctx,q), -m_sortedAxisGPU1prev(ctx,q), -m_sortedAxisGPU2prev(ctx,q), -m_addedHostPairsGPU(ctx,q), -m_removedHostPairsGPU(ctx,q), -m_addedCountGPU(ctx,q), -m_removedCountGPU(ctx,q), -m_currentBuffer(-1), -m_pairCount(ctx,q), -m_allAabbsGPU(ctx,q), -m_sum(ctx,q), -m_sum2(ctx,q), -m_dst(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_overlappingPairs(ctx,q), -m_gpuSmallSortData(ctx,q), -m_gpuSmallSortedAabbs(ctx,q) +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType) + : m_context(ctx), + m_device(device), + m_queue(q), + + m_objectMinMaxIndexGPUaxis0(ctx, q), + m_objectMinMaxIndexGPUaxis1(ctx, q), + m_objectMinMaxIndexGPUaxis2(ctx, q), + m_objectMinMaxIndexGPUaxis0prev(ctx, q), + m_objectMinMaxIndexGPUaxis1prev(ctx, q), + m_objectMinMaxIndexGPUaxis2prev(ctx, q), + m_sortedAxisGPU0(ctx, q), + m_sortedAxisGPU1(ctx, q), + m_sortedAxisGPU2(ctx, q), + m_sortedAxisGPU0prev(ctx, q), + m_sortedAxisGPU1prev(ctx, q), + m_sortedAxisGPU2prev(ctx, q), + m_addedHostPairsGPU(ctx, q), + m_removedHostPairsGPU(ctx, q), + m_addedCountGPU(ctx, q), + m_removedCountGPU(ctx, q), + m_currentBuffer(-1), + m_pairCount(ctx, q), + m_allAabbsGPU(ctx, q), + m_sum(ctx, q), + m_sum2(ctx, q), + m_dst(ctx, q), + m_smallAabbsMappingGPU(ctx, q), + m_largeAabbsMappingGPU(ctx, q), + m_overlappingPairs(ctx, q), + m_gpuSmallSortData(ctx, q), + m_gpuSmallSortedAabbs(ctx, q) { const char* sapSrc = sapCL; - - - cl_int errNum=0; + + cl_int errNum = 0; b3Assert(m_context); b3Assert(m_device); - cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); - b3Assert(errNum==CL_SUCCESS); - + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); + b3Assert(errNum == CL_SUCCESS); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); #ifndef __APPLE__ - m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue); #else m_prefixScanFloat4 = 0; #endif m_sapKernel = 0; - + switch (kernelType) { case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: { - m_sapKernel=0; + m_sapKernel = 0; break; } - case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_ORIGINAL: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_BARRIER: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); break; } default: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); } }; - - - - m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); - m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - - m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg); - m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); - m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg); + + m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuSapBroadphase::~b3GpuSapBroadphase() @@ -173,13 +167,11 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase() clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); clReleaseKernel(m_prepareSumVarianceKernel); - - } /// conservative test for overlap between two aabbs -static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, - const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1, + const b3Vector3& aabbMin2, const b3Vector3& aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; @@ -188,8 +180,6 @@ static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aab return overlap; } - - //http://stereopsis.com/radix.html static unsigned int FloatFlip(float fl) { @@ -198,79 +188,77 @@ static unsigned int FloatFlip(float fl) return f ^ mask; }; -void b3GpuSapBroadphase::init3dSap() +void b3GpuSapBroadphase::init3dSap() { - if (m_currentBuffer<0) + if (m_currentBuffer < 0) { m_allAabbsGPU.copyToHost(m_allAabbsCPU); m_currentBuffer = 0; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { int totalNumAabbs = m_allAabbsCPU.size(); - int numEndPoints = 2*totalNumAabbs; + int numEndPoints = 2 * totalNumAabbs; m_sortedAxisCPU[axis][buf].resize(numEndPoints); - if (buf==m_currentBuffer) + if (buf == m_currentBuffer) { - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; - m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; - m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; - m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; + m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1; + m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1; } } } } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { //int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } - } } - static bool b3PairCmp(const b3Int4& p, const b3Int4& q) { - return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); + return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y))); } - -static bool operator==(const b3Int4& a,const b3Int4& b) +static bool operator==(const b3Int4& a, const b3Int4& b) { return a.x == b.x && a.y == b.y; }; -static bool operator<(const b3Int4& a,const b3Int4& b) +static bool operator<(const b3Int4& a, const b3Int4& b) { return a.x < b.x || (a.x == b.x && a.y < b.y); }; -static bool operator>(const b3Int4& a,const b3Int4& b) +static bool operator>(const b3Int4& a, const b3Int4& b) { return a.x > b.x || (a.x == b.x && a.y > b.y); }; @@ -278,31 +266,29 @@ static bool operator>(const b3Int4& a,const b3Int4& b) b3AlignedObjectArray<b3Int4> addedHostPairs; b3AlignedObjectArray<b3Int4> removedHostPairs; -b3AlignedObjectArray<b3SapAabb> preAabbs; +b3AlignedObjectArray<b3SapAabb> preAabbs; -void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { //static int framepje = 0; //printf("framepje=%d\n",framepje++); - B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); addedHostPairs.resize(0); removedHostPairs.resize(0); - b3Assert(m_currentBuffer>=0); - + b3Assert(m_currentBuffer >= 0); + { preAabbs.resize(m_allAabbsCPU.size()); - for (int i=0;i<preAabbs.size();i++) + for (int i = 0; i < preAabbs.size(); i++) { - preAabbs[i]=m_allAabbsCPU[i]; + preAabbs[i] = m_allAabbsCPU[i]; } } - - if (m_currentBuffer<0) + if (m_currentBuffer < 0) return; { B3_PROFILE("m_allAabbsGPU.copyToHost"); @@ -316,100 +302,87 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } if (0) { - { - printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", - m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], - m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); - } - - { - printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", - m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], - m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); - } - - - { - b3Int4 newPair; - newPair.x = 40; - newPair.y = 53; - int index = allPairs.findBinarySearch(newPair); - printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); - { - int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); - printf("overlap=%d\n",overlap); + printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", + m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2], + m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]); } - if (preAabbs.size()) - { - int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); - printf("prevoverlap=%d\n",prevOverlap); - } else { - printf("unknown prevoverlap\n"); + printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", + m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2], + m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]); } - } - } + { + b3Int4 newPair; + newPair.x = 40; + newPair.y = 53; + int index = allPairs.findBinarySearch(newPair); + printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size()); + { + int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max); + printf("overlap=%d\n", overlap); + } + + if (preAabbs.size()) + { + int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max); + printf("prevoverlap=%d\n", prevOverlap); + } + else + { + printf("unknown prevoverlap\n"); + } + } + } if (0) { - for (int i=0;i<m_allAabbsCPU.size();i++) + for (int i = 0; i < m_allAabbsCPU.size(); i++) { //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); - - } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { - b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); + b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2); } } } - - - m_currentBuffer = 1-m_currentBuffer; - - + m_currentBuffer = 1 - m_currentBuffer; int totalNumAabbs = m_allAabbsCPU.size(); { B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - - unsigned int keyMin[3]; unsigned int keyMax[3]; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - float vmin=m_allAabbsCPU[i].m_min[axis]; + float vmin = m_allAabbsCPU[i].m_min[axis]; float vmax = m_allAabbsCPU[i].m_max[axis]; keyMin[axis] = FloatFlip(vmin); keyMax[axis] = FloatFlip(vmax); - - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; - m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; - m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1; } //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); - } } - - { B3_PROFILE("sort m_sortedAxisCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } @@ -432,21 +405,22 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { B3_PROFILE("assign m_objectMinMaxIndexCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } @@ -485,12 +459,11 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } #endif - int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size(); int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); - b3Assert(a==b); - b3Assert(b==c); + b3Assert(a == b); + b3Assert(b == c); /* if (searchIncremental3dSapOnGpu) { @@ -574,175 +547,170 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); B3_PROFILE("actual search"); - for (int i=0;i<numObjects;i++) + for (int i = 0; i < numObjects; i++) { //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size(); //int checkObjects[]={40,53}; //int numCheckObjects = sizeof(checkObjects)/sizeof(int); - + //for (int a=0;a<numCheckObjects ;a++) - - for (int axis=0;axis<3;axis++) + + for (int axis = 0; axis < 3; axis++) { //int i = checkObjects[a]; unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x; unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; - unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; + unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x; int dmin = curMinIndex - prevMinIndex; - - unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; - + unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y; int dmax = curMaxIndex - prevMaxIndex; - if (dmin!=0) + if (dmin != 0) { //printf("for object %d, dmin=%d\n",i,dmin); } - if (dmax!=0) + if (dmax != 0) { //printf("for object %d, dmax=%d\n",i,dmax); } - for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) + for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++) { - if (dmin!=0) + if (dmin != 0) { - int stepMin = dmin<0 ? -1 : 1; - for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) + int stepMin = dmin < 0 ? -1 : 1; + for (int j = prevMinIndex; j != curMinIndex; j += stepMin) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { - bool otherIsMax = ((otherIndex2&1)!=0); + bool otherIsMax = ((otherIndex2 & 1) != 0); if (otherIsMax) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } - // b3Assert(overlap2==overlap); + // b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - //b3Assert(overlap==overlap2); - - - if (dmin<0) + if (dmin < 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); } - } + } else { if (!overlap && prevOverlap) { - //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); } - }//otherisMax - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + } //otherisMax + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - - if (dmax!=0) + + if (dmax != 0) { - int stepMax = dmax<0 ? -1 : 1; - for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) + int stepMax = dmax < 0 ? -1 : 1; + for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { //bool otherIsMin = ((otherIndex2&1)==0); //if (otherIsMin) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } //b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - - if (dmax>0) + if (dmax > 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); - } - } + } else { if (!overlap && prevOverlap) @@ -750,33 +718,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() //if (otherIndex2&1==0) -> min? //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); - } } - - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - }//for (int otherbuffer - }//for (int axis=0; - }//for (int i=0;i<numObjects + } //for (int otherbuffer + } //for (int axis=0; + } //for (int i=0;i<numObjects } //remove duplicates and add/remove then to existing m_overlappingPairs - - - + { { B3_PROFILE("sort allPairs"); @@ -795,31 +761,28 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() b3Int4 prevPair; prevPair.x = -1; prevPair.y = -1; - + int uniqueRemovedPairs = 0; b3AlignedObjectArray<int> removedPositions; { B3_PROFILE("actual removing"); - for (int i=0;i<removedHostPairs.size();i++) + for (int i = 0; i < removedHostPairs.size(); i++) { b3Int4 removedPair = removedHostPairs[i]; if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) { + int index1 = allPairs.findBinarySearch(removedPair); - int index1 = allPairs.findBinarySearch(removedPair); + //#ifdef _DEBUG - //#ifdef _DEBUG - - - int index2 = allPairs.findLinearSearch(removedPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); + //b3Assert(index1!=allPairs.size()); - if (index1<allPairs.size()) - //#endif//_DEBUG + if (index1 < allPairs.size()) + //#endif//_DEBUG { uniqueRemovedPairs++; removedPositions.push_back(index1); @@ -833,13 +796,13 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() if (uniqueRemovedPairs) { - for (int i=0;i<removedPositions.size();i++) + for (int i = 0; i < removedPositions.size(); i++) { - allPairs[removedPositions[i]].x = INT_MAX ; - allPairs[removedPositions[i]].y = INT_MAX ; + allPairs[removedPositions[i]].x = INT_MAX; + allPairs[removedPositions[i]].y = INT_MAX; } allPairs.quickSort(b3PairCmp); - allPairs.resize(allPairs.size()-uniqueRemovedPairs); + allPairs.resize(allPairs.size() - uniqueRemovedPairs); } } //if (uniqueRemovedPairs) @@ -848,33 +811,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() prevPair.x = -1; prevPair.y = -1; - - int uniqueAddedPairs=0; + + int uniqueAddedPairs = 0; b3AlignedObjectArray<b3Int4> actualAddedPairs; { B3_PROFILE("actual adding"); - for (int i=0;i<addedHostPairs.size();i++) + for (int i = 0; i < addedHostPairs.size(); i++) { b3Int4 newPair = addedHostPairs[i]; if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) { -//#ifdef _DEBUG + //#ifdef _DEBUG int index1 = allPairs.findBinarySearch(newPair); - - + int index2 = allPairs.findLinearSearch(newPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); - b3Assert(index1==allPairs.size()); - if (index1!=allPairs.size()) + b3Assert(index1 == allPairs.size()); + if (index1 != allPairs.size()) { printf("??\n"); } - if (index1==allPairs.size()) -//#endif //_DEBUG + if (index1 == allPairs.size()) + //#endif //_DEBUG { uniqueAddedPairs++; actualAddedPairs.push_back(newPair); @@ -882,94 +843,83 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } prevPair = newPair; } - for (int i=0;i<actualAddedPairs.size();i++) + for (int i = 0; i < actualAddedPairs.size(); i++) { //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); allPairs.push_back(actualAddedPairs[i]); } } - + //if (uniqueAddedPairs) // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); - { B3_PROFILE("m_overlappingPairs.copyFromHost"); m_overlappingPairs.copyFromHost(allPairs); } - - } - - - -void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) { //test -// if (m_currentBuffer>=0) + // if (m_currentBuffer>=0) // return calculateOverlappingPairsHostIncremental3Sap(); b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); m_allAabbsGPU.copyToHost(m_allAabbsCPU); - - - int axis=0; + int axis = 0; { B3_PROFILE("CPU compute best variance axis"); - b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); + b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0); int numRigidBodies = m_smallAabbsMappingCPU.size(); - for(int i=0;i<numRigidBodies;i++) + for (int i = 0; i < numRigidBodies; i++) { b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; - b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); - b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); - b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; - + b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]); + b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]); + b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f; + s += centerAabb; - s2 += centerAabb*centerAabb; + s2 += centerAabb * centerAabb; } - b3Vector3 v = s2 - (s*s) / (float)numRigidBodies; - - if(v[1] > v[0]) + b3Vector3 v = s2 - (s * s) / (float)numRigidBodies; + + if (v[1] > v[0]) axis = 1; - if(v[2] > v[axis]) + if (v[2] > v[axis]) axis = 2; } - - - b3AlignedObjectArray<b3Int4> hostPairs; { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; - for (int j=i+1;j<numSmallAabbs;j++) + for (int j = i + 1; j < numSmallAabbs; j++) { - b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) + (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max)) { b3Int4 pair; int a = smallAabbi.m_minIndices[3]; int b = smallAabbj.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a;//store the original index in the unsorted aabb array + pair.x = a; //store the original index in the unsorted aabb array pair.y = b; - } else + } + else { - pair.x = b;//store the original index in the unsorted aabb array + pair.x = b; //store the original index in the unsorted aabb array pair.y = a; } hostPairs.push_back(pair); @@ -978,35 +928,35 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) } } - { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; int numLargeAabbs = m_largeAabbsMappingCPU.size(); - for (int j=0;j<numLargeAabbs;j++) + for (int j = 0; j < numLargeAabbs; j++) { b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) + (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max)) { b3Int4 pair; int a = largeAabbj.m_minIndices[3]; int b = smallAabbi.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a; - pair.y = b;//store the original index in the unsorted aabb array - } else + pair.x = a; + pair.y = b; //store the original index in the unsorted aabb array + } + else { pair.x = b; - pair.y = a;//store the original index in the unsorted aabb array + pair.y = a; //store the original index in the unsorted aabb array } - + hostPairs.push_back(pair); } } @@ -1021,21 +971,20 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) if (hostPairs.size()) { m_overlappingPairs.copyFromHost(hostPairs); - } else + } + else { m_overlappingPairs.resize(0); } //init3dSap(); - } -void b3GpuSapBroadphase::reset() +void b3GpuSapBroadphase::reset() { m_allAabbsGPU.resize(0); m_allAabbsCPU.resize(0); - m_smallAabbsMappingGPU.resize(0); m_smallAabbsMappingCPU.resize(0); @@ -1043,13 +992,11 @@ void b3GpuSapBroadphase::reset() m_largeAabbsMappingGPU.resize(0); m_largeAabbsMappingCPU.resize(0); - } - -void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) { - if (m_sapKernel==0) + if (m_sapKernel == 0) { calculateOverlappingPairsHost(maxPairs); return; @@ -1065,68 +1012,62 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) int axis = 0; { + //bool syncOnHost = false; - //bool syncOnHost = false; - - int numSmallAabbs = m_smallAabbsMappingCPU.size(); - if (m_prefixScanFloat4 && numSmallAabbs) - { - B3_PROFILE("GPU compute best variance axis"); - - if (m_dst.size()!=(numSmallAabbs+1)) + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + if (m_prefixScanFloat4 && numSmallAabbs) { - m_dst.resize(numSmallAabbs+128); - m_sum.resize(numSmallAabbs+128); - m_sum2.resize(numSmallAabbs+128); - m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? - m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? - } + B3_PROFILE("GPU compute best variance axis"); - b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); - launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - - launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); - launcher.setBuffer(m_sum.getBufferCL()); - launcher.setBuffer(m_sum2.getBufferCL()); - launcher.setConst( numSmallAabbs ); - int num = numSmallAabbs; - launcher.launch1D( num); - + if (m_dst.size() != (numSmallAabbs + 1)) + { + m_dst.resize(numSmallAabbs + 128); + m_sum.resize(numSmallAabbs + 128); + m_sum2.resize(numSmallAabbs + 128); + m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + } - b3Vector3 s; - b3Vector3 s2; - m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); - m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel"); + launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; - - if(v[1] > v[0]) - axis = 1; - if(v[2] > v[axis]) - axis = 2; - } + launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launcher.setBuffer(m_sum.getBufferCL()); + launcher.setBuffer(m_sum2.getBufferCL()); + launcher.setConst(numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D(num); + b3Vector3 s; + b3Vector3 s2; + m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s); + m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2); + + b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs; + + if (v[1] > v[0]) + axis = 1; + if (v[2] > v[axis]) + axis = 2; + } - m_gpuSmallSortData.resize(numSmallAabbs); - #if 1 if (m_smallAabbsMappingGPU.size()) { - B3_PROFILE("flipFloatKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); } @@ -1141,69 +1082,66 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) if (numSmallAabbs) { B3_PROFILE("scatterKernel"); - - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true), b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs); + b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel "); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); - } - - m_overlappingPairs.resize(maxPairs); + m_overlappingPairs.resize(maxPairs); - m_pairCount.resize(0); - m_pairCount.push_back(0); - int numPairs=0; + m_pairCount.resize(0); + m_pairCount.push_back(0); + int numPairs = 0; + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) { - int numLargeAabbs = m_largeAabbsMappingGPU.size(); - if (numLargeAabbs && numSmallAabbs) + //@todo + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL()), + b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_overlappingPairs.getBufferCL()), + b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numLargeAabbs); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) { - //@todo - B3_PROFILE("sap2Kernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), - b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), - b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numLargeAabbs ); - launcher.setConst( numSmallAabbs); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); -//@todo: use actual maximum work item sizes of the device instead of hardcoded values - launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); - - numPairs = m_pairCount.at(0); - if (numPairs >maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; - } + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; } } - if (m_gpuSmallSortedAabbs.size()) - { - B3_PROFILE("sapKernel"); - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); - - - int num = numSmallAabbs; + } + if (m_gpuSmallSortedAabbs.size()) + { + B3_PROFILE("sapKernel"); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + + int num = numSmallAabbs; #if 0 int buffSize = launcher.getSerializationBufferSize(); unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; @@ -1225,73 +1163,71 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) FILE* f = fopen("m_sapKernelArgs.bin","wb"); fwrite(buf,buffSize+sizeof(int),1,f); fclose(f); -#endif// +#endif // - launcher.launch1D( num); - clFinish(m_queue); - - numPairs = m_pairCount.at(0); - if (numPairs>maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs = maxPairs; - m_pairCount.resize(0); - m_pairCount.push_back(maxPairs); - } + launcher.launch1D(num); + clFinish(m_queue); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + m_pairCount.resize(0); + m_pairCount.push_back(maxPairs); } - + } + #else - int numPairs = 0; - - - b3LauncherCL launcher(m_queue, m_sapKernel); - - const char* fileName = "m_sapKernelArgs.bin"; - FILE* f = fopen(fileName,"rb"); - if (f) - { - int sizeInBytes=0; - if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) - { - printf("error, cannot get file size\n"); - exit(0); - } - - unsigned char* buf = (unsigned char*) malloc(sizeInBytes); - fread(buf,sizeInBytes,1,f); - int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); - int num = *(int*)&buf[serializedBytes]; - launcher.launch1D( num); - - b3OpenCLArray<int> pairCount(m_context, m_queue); - int numElements = launcher.m_arrays[2]->size()/sizeof(int); - pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); - numPairs = pairCount.at(0); - //printf("overlapping pairs = %d\n",numPairs); - b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; - b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); - tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); - - tmpGpuPairs.copyToHost(hostOoverlappingPairs); - m_overlappingPairs.copyFromHost(hostOoverlappingPairs); - //printf("hello %d\n", m_overlappingPairs.size()); - free(buf); - fclose(f); - - } else { - printf("error: cannot find file %s\n",fileName); - } - - clFinish(m_queue); - - + int numPairs = 0; + + b3LauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName, "rb"); + if (f) + { + int sizeInBytes = 0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*)malloc(sizeInBytes); + fread(buf, sizeInBytes, 1, f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D(num); + + b3OpenCLArray<int> pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size() / sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; + b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + } + else + { + printf("error: cannot find file %s\n", fileName); + } + + clFinish(m_queue); + #endif - - m_overlappingPairs.resize(numPairs); - - }//B3_PROFILE("GPU_RADIX SORT"); - //init3dSap(); + m_overlappingPairs.resize(numPairs); + + } //B3_PROFILE("GPU_RADIX SORT"); + //init3dSap(); } void b3GpuSapBroadphase::writeAabbsToGpu() @@ -1299,17 +1235,14 @@ void b3GpuSapBroadphase::writeAabbsToGpu() m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); - m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this - - - + m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this } -void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1317,15 +1250,15 @@ void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vec aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - + m_allAabbsCPU.push_back(aabb); } -void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1334,20 +1267,19 @@ void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - m_allAabbsCPU.push_back(aabb); } -cl_mem b3GpuSapBroadphase::getAabbBufferWS() +cl_mem b3GpuSapBroadphase::getAabbBufferWS() { return m_allAabbsGPU.getBufferCL(); } -int b3GpuSapBroadphase::getNumOverlap() +int b3GpuSapBroadphase::getNumOverlap() { return m_overlappingPairs.size(); } -cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); } |