diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision')
13 files changed, 2234 insertions, 2395 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h index 0ed8aa8232..b296992525 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -12,33 +12,31 @@ class b3GpuBroadphaseInterface { public: - - typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q); + typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q); virtual ~b3GpuBroadphaseInterface() { } - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; - virtual void calculateOverlappingPairs(int maxPairs)=0; - virtual void calculateOverlappingPairsHost(int maxPairs)=0; + virtual void calculateOverlappingPairs(int maxPairs) = 0; + virtual void calculateOverlappingPairsHost(int maxPairs) = 0; //call writeAabbsToGpu after done making all changes (createProxy etc) - virtual void writeAabbsToGpu()=0; + virtual void writeAabbsToGpu() = 0; + + virtual cl_mem getAabbBufferWS() = 0; + virtual int getNumOverlap() = 0; + virtual cl_mem getOverlappingPairBuffer() = 0; - virtual cl_mem getAabbBufferWS()=0; - virtual int getNumOverlap()=0; - virtual cl_mem getOverlappingPairBuffer()=0; + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0; + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0; - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0; - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0; - virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0; virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0; virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0; - }; -#endif //B3_GPU_BROADPHASE_INTERFACE_H +#endif //B3_GPU_BROADPHASE_INTERFACE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index 74d0c8056c..e714fadac3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -5,12 +5,9 @@ #include "kernels/sapKernels.h" //#include "kernels/gridBroadphase.cl" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" - - #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" #define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" @@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs; cl_kernel m_copyAabbsKernel; cl_kernel m_sap2Kernel; - - - - //int maxPairsPerBody = 64; -int maxBodiesPerCell = 256;//?? +int maxBodiesPerCell = 256; //?? -b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) -:m_context(ctx), -m_device(device), -m_queue(q), -m_allAabbsGPU1(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_gpuPairs(ctx,q), +b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q) + : m_context(ctx), + m_device(device), + m_queue(q), + m_allAabbsGPU1(ctx, q), + m_smallAabbsMappingGPU(ctx, q), + m_largeAabbsMappingGPU(ctx, q), + m_gpuPairs(ctx, q), -m_hashGpu(ctx,q), + m_hashGpu(ctx, q), -m_cellStartGpu(ctx,q), -m_paramsGPU(ctx,q) + m_cellStartGpu(ctx, q), + m_paramsGPU(ctx, q) { - - - b3Vector3 gridSize = b3MakeVector3(3,3,3); - b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); + b3Vector3 gridSize = b3MakeVector3(3, 3, 3); + b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]); m_paramsCPU.m_gridSize[0] = 128; m_paramsCPU.m_gridSize[1] = 128; @@ -58,92 +49,79 @@ m_paramsGPU(ctx,q) m_paramsCPU.m_invCellSize[3] = 0.f; m_paramsGPU.push_back(m_paramsCPU); - cl_int errNum=0; + cl_int errNum = 0; { const char* sapSrc = sapCL; - cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); - b3Assert(errNum==CL_SUCCESS); - m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); - m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); + b3Assert(errNum == CL_SUCCESS); + m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); } { - - cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH); - b3Assert(errNum==CL_SUCCESS); - - kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - - kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - - - - } + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH); + b3Assert(errNum == CL_SUCCESS); - m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + } + + m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuGridBroadphase::~b3GpuGridBroadphase() { - clReleaseKernel( kCalcHashAABB); - clReleaseKernel( kClearCellStart); - clReleaseKernel( kFindCellStart); - clReleaseKernel( kFindOverlappingPairs); - clReleaseKernel( m_sap2Kernel); - clReleaseKernel( m_copyAabbsKernel); - - - + clReleaseKernel(kCalcHashAABB); + clReleaseKernel(kClearCellStart); + clReleaseKernel(kFindCellStart); + clReleaseKernel(kFindOverlappingPairs); + clReleaseKernel(m_sap2Kernel); + clReleaseKernel(m_copyAabbsKernel); + delete m_sorter; } - - -void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; aabb.m_minIndices[3] = userPtr; - aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr; m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); m_allAabbsCPU1.push_back(aabb); - } -void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; aabb.m_minIndices[3] = userPtr; - aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr; m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); m_allAabbsCPU1.push_back(aabb); } -void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); - if (0) { calculateOverlappingPairsHost(maxPairs); - /* + /* b3AlignedObjectArray<b3Int4> cpuPairs; m_gpuPairs.copyToHost(cpuPairs); printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); @@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) */ return; } - - - - int numSmallAabbs = m_smallAabbsMappingGPU.size(); - b3OpenCLArray<int> pairCount(m_context,m_queue); + b3OpenCLArray<int> pairCount(m_context, m_queue); pairCount.push_back(0); - m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody); + m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody); { int numLargeAabbs = m_largeAabbsMappingGPU.size(); if (numLargeAabbs && numSmallAabbs) { B3_PROFILE("sap2Kernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ), - b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_gpuPairs.getBufferCL() ), + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()), + b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_gpuPairs.getBufferCL()), b3BufferInfoCL(pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numLargeAabbs ); - launcher.setConst( numSmallAabbs); - launcher.setConst( 0 );//axis is not used - launcher.setConst( maxPairs ); - //@todo: use actual maximum work item sizes of the device instead of hardcoded values - launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); - + b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numLargeAabbs); + launcher.setConst(numSmallAabbs); + launcher.setConst(0); //axis is not used + launcher.setConst(maxPairs); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + int numPairs = pairCount.at(0); - - if (numPairs >maxPairs) + + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; + numPairs = maxPairs; } } } - - - if (numSmallAabbs) { B3_PROFILE("gridKernel"); m_hashGpu.resize(numSmallAabbs); { B3_PROFILE("kCalcHashAABB"); - b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB"); + b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB"); launch.setConst(numSmallAabbs); launch.setBuffer(m_allAabbsGPU1.getBufferCL()); launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); @@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) } m_sorter->execute(m_hashGpu); - - int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + + int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2]; m_cellStartGpu.resize(numCells); //b3AlignedObjectArray<int > cellStartCpu; - - + { B3_PROFILE("kClearCellStart"); - b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart"); + b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart"); launch.setConst(numCells); launch.setBuffer(m_cellStartGpu.getBufferCL()); launch.launch1D(numCells); //m_cellStartGpu.copyToHost(cellStartCpu); //printf("??\n"); - } - { B3_PROFILE("kFindCellStart"); - b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart"); + b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart"); launch.setConst(numSmallAabbs); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); launch.launch1D(numSmallAabbs); //m_cellStartGpu.copyToHost(cellStartCpu); //printf("??\n"); - } - + { B3_PROFILE("kFindOverlappingPairs"); - - - b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs"); + + b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs"); launch.setConst(numSmallAabbs); launch.setBuffer(m_allAabbsGPU1.getBufferCL()); launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); - + launch.setBuffer(m_paramsGPU.getBufferCL()); //launch.setBuffer(0); launch.setBuffer(pairCount.getBufferCL()); launch.setBuffer(m_gpuPairs.getBufferCL()); - + launch.setConst(maxPairs); launch.launch1D(numSmallAabbs); - int numPairs = pairCount.at(0); - if (numPairs >maxPairs) + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; + numPairs = maxPairs; } - + m_gpuPairs.resize(numPairs); - + if (0) { b3AlignedObjectArray<b3Int4> pairsCpu; m_gpuPairs.copyToHost(pairsCpu); int sz = m_gpuPairs.size(); - printf("m_gpuPairs.size()=%d\n",sz); - for (int i=0;i<m_gpuPairs.size();i++) + printf("m_gpuPairs.size()=%d\n", sz); + for (int i = 0; i < m_gpuPairs.size(); i++) { - printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y); + printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y); } printf("?!?\n"); } - } - - } - - - - //calculateOverlappingPairsHost(maxPairs); } -void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) { - m_hostPairs.resize(0); m_allAabbsGPU1.copyToHost(m_allAabbsCPU1); - for (int i=0;i<m_allAabbsCPU1.size();i++) + for (int i = 0; i < m_allAabbsCPU1.size(); i++) { - for (int j=i+1;j<m_allAabbsCPU1.size();j++) + for (int j = i + 1; j < m_allAabbsCPU1.size(); j++) { if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec, - m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec)) + m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec)) { b3Int4 pair; int a = m_allAabbsCPU1[j].m_minIndices[3]; int b = m_allAabbsCPU1[i].m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a; - pair.y = b;//store the original index in the unsorted aabb array - } else + pair.x = a; + pair.y = b; //store the original index in the unsorted aabb array + } + else { pair.x = b; - pair.y = a;//store the original index in the unsorted aabb array + pair.y = a; //store the original index in the unsorted aabb array } - - if (m_hostPairs.size()<maxPairs) + + if (m_hostPairs.size() < maxPairs) { m_hostPairs.push_back(pair); } @@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) } } - m_gpuPairs.copyFromHost(m_hostPairs); - - } - //call writeAabbsToGpu after done making all changes (createProxy etc) +//call writeAabbsToGpu after done making all changes (createProxy etc) void b3GpuGridBroadphase::writeAabbsToGpu() { m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1); m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); - } -cl_mem b3GpuGridBroadphase::getAabbBufferWS() +cl_mem b3GpuGridBroadphase::getAabbBufferWS() { return this->m_allAabbsGPU1.getBufferCL(); } -int b3GpuGridBroadphase::getNumOverlap() +int b3GpuGridBroadphase::getNumOverlap() { return m_gpuPairs.size(); } -cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() { return m_gpuPairs.getBufferCL(); } -b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() +b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() { return m_allAabbsGPU1; } -b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() +b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() { return m_allAabbsCPU1; } @@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU() { return m_largeAabbsMappingGPU; } - diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index ec18c9f716..b76cb43b68 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -6,83 +6,75 @@ struct b3ParamsGridBroadphaseCL { - float m_invCellSize[4]; - int m_gridSize[4]; + int m_gridSize[4]; - int getMaxBodiesPerCell() const + int getMaxBodiesPerCell() const { return m_gridSize[3]; } - void setMaxBodiesPerCell(int maxOverlap) + void setMaxBodiesPerCell(int maxOverlap) { m_gridSize[3] = maxOverlap; } }; - class b3GpuGridBroadphase : public b3GpuBroadphaseInterface { protected: - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; - b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; - b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; + b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; - b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3OpenCLArray<int> m_smallAabbsMappingGPU; b3AlignedObjectArray<int> m_smallAabbsMappingCPU; - b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3OpenCLArray<int> m_largeAabbsMappingGPU; b3AlignedObjectArray<int> m_largeAabbsMappingCPU; b3AlignedObjectArray<b3Int4> m_hostPairs; - b3OpenCLArray<b3Int4> m_gpuPairs; + b3OpenCLArray<b3Int4> m_gpuPairs; - b3OpenCLArray<b3SortData> m_hashGpu; - b3OpenCLArray<int> m_cellStartGpu; - + b3OpenCLArray<b3SortData> m_hashGpu; + b3OpenCLArray<int> m_cellStartGpu; - b3ParamsGridBroadphaseCL m_paramsCPU; - b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; + b3ParamsGridBroadphaseCL m_paramsCPU; + b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; - class b3RadixSort32CL* m_sorter; + class b3RadixSort32CL* m_sorter; public: - - b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); + b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q); virtual ~b3GpuGridBroadphase(); - static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuGridBroadphase(ctx,device,q); + return new b3GpuGridBroadphase(ctx, device, q); } - - + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - - virtual void calculateOverlappingPairs(int maxPairs); - virtual void calculateOverlappingPairsHost(int maxPairs); + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - virtual cl_mem getAabbBufferWS(); - virtual int getNumOverlap(); - virtual cl_mem getOverlappingPairBuffer(); + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); - virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); - }; -#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file +#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp index 641df9eb12..0721928684 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -16,177 +16,174 @@ subject to the following restrictions: #include "b3GpuParallelLinearBvh.h" -b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : - m_queue(queue), - m_radixSorter(context, device, queue), - - m_rootNodeIndex(context, queue), - m_maxDistanceFromRoot(context, queue), - m_temp(context, queue), - - m_internalNodeAabbs(context, queue), - m_internalNodeLeafIndexRanges(context, queue), - m_internalNodeChildNodes(context, queue), - m_internalNodeParentNodes(context, queue), - - m_commonPrefixes(context, queue), - m_commonPrefixLengths(context, queue), - m_distanceFromRoot(context, queue), - - m_leafNodeParentNodes(context, queue), - m_mortonCodesAndAabbIndicies(context, queue), - m_mergedAabb(context, queue), - m_leafNodeAabbs(context, queue), - - m_largeAabbs(context, queue) +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + m_maxDistanceFromRoot(context, queue), + m_temp(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_commonPrefixes(context, queue), + m_commonPrefixLengths(context, queue), + m_distanceFromRoot(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue), + + m_largeAabbs(context, queue) { m_rootNodeIndex.resize(1); m_maxDistanceFromRoot.resize(1); m_temp.resize(1); - + // const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; - - const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h cl_int error; char* additionalMacros = 0; m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); b3Assert(m_parallelLinearBvhProgram); - - m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_separateAabbsKernel); - m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findAllNodesMergedAabbKernel); - m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); - - m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_computeAdjacentPairCommonPrefixKernel); - m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); - m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); - m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findDistanceFromRootKernel); - m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); - - m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findLeafIndexRangesKernel); - - m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhCalculateOverlappingPairsKernel); - m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhRayTraverseKernel); - m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhLargeAabbAabbTestKernel); - m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhLargeAabbRayTestKernel); } -b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() { clReleaseKernel(m_separateAabbsKernel); clReleaseKernel(m_findAllNodesMergedAabbKernel); clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); - + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); clReleaseKernel(m_findDistanceFromRootKernel); clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); - + clReleaseKernel(m_findLeafIndexRangesKernel); - + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); clReleaseKernel(m_plbvhRayTraverseKernel); clReleaseKernel(m_plbvhLargeAabbAabbTestKernel); clReleaseKernel(m_plbvhLargeAabbRayTestKernel); - + clReleaseProgram(m_parallelLinearBvhProgram); } -void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, - const b3OpenCLArray<int>& largeAabbIndices) +void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices) { B3_PROFILE("b3ParallelLinearBvh::build()"); - + int numLargeAabbs = largeAabbIndices.size(); int numSmallAabbs = smallAabbIndices.size(); - - //Since all AABBs(both large and small) are input as a contiguous array, + + //Since all AABBs(both large and small) are input as a contiguous array, //with 2 additional arrays used to indicate the indices of large and small AABBs, //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH. { B3_PROFILE("Separate large and small AABBs"); - + m_largeAabbs.resize(numLargeAabbs); m_leafNodeAabbs.resize(numSmallAabbs); - + //Write large AABBs into m_largeAabbs { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( largeAabbIndices.getBufferCL() ), - - b3BufferInfoCL( m_largeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), + b3BufferInfoCL(largeAabbIndices.getBufferCL()), + + b3BufferInfoCL(m_largeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLargeAabbs); - + launcher.launch1D(numLargeAabbs); } - + //Write small AABBs into m_leafNodeAabbs { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( smallAabbIndices.getBufferCL() ), - - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), + b3BufferInfoCL(smallAabbIndices.getBufferCL()), + + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numSmallAabbs); - + launcher.launch1D(numSmallAabbs); } - + clFinish(m_queue); } - + // - int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs + int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs int numInternalNodes = numLeaves - 1; - - if(numLeaves < 2) + + if (numLeaves < 2) { //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(), //so it does not matter if numLeaves == 0 and rootNodeIndex == -1 int rootNodeIndex = numLeaves - 1; m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); - + //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm, //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work. //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) - if(numLeaves == 1) + if (numLeaves == 1) { b3SortData leaf; - leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set - + leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set + m_mortonCodesAndAabbIndicies.resize(1); m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1); } - + return; } - + // { m_internalNodeAabbs.resize(numInternalNodes); @@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab m_commonPrefixes.resize(numInternalNodes); m_commonPrefixLengths.resize(numInternalNodes); m_distanceFromRoot.resize(numInternalNodes); - + m_leafNodeParentNodes.resize(numLeaves); m_mortonCodesAndAabbIndicies.resize(numLeaves); m_mergedAabb.resize(numLeaves); } - - //Find the merged AABB of all small AABBs; this is used to define the size of + + //Find the merged AABB of all small AABBs; this is used to define the size of //each cell in the virtual grid for the next kernel(2^10 cells in each dimension). { B3_PROFILE("Find AABB of merged nodes"); - - m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array - - for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; - numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + + m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array + + for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0] + }; + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numAabbsNeedingMerge); - + launcher.launch1D(numAabbsNeedingMerge); } - + clFinish(m_queue); } - + //Insert the center of the AABBs into a virtual grid, //then convert the discrete grid coordinates into a morton code //For each element in m_mortonCodesAndAabbIndicies, set @@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab // m_value == small AABB index { B3_PROFILE("Assign morton codes"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_mergedAabb.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_mergedAabb.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLeaves); - + launcher.launch1D(numLeaves); clFinish(m_queue); } - + // { B3_PROFILE("Sort leaves by morton codes"); - + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); clFinish(m_queue); } - + // constructBinaryRadixTree(); - - + //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices. //The root node contains leaf node indices in the range [0, numLeafNodes - 1]. //The child nodes of each node split their parent's index range into 2 contiguous halves. @@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice { B3_PROFILE("m_findLeafIndexRangesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } @@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou { int maxPairs = out_overlappingPairs.size(); b3OpenCLArray<int>& numPairsGpu = m_temp; - + int reset = 0; numPairsGpu.copyFromHostPointer(&reset, 1); - + // - if( m_leafNodeAabbs.size() > 1 ) + if (m_leafNodeAabbs.size() > 1) { B3_PROFILE("PLBVH small-small AABB test"); - + int numQueryAabbs = m_leafNodeAabbs.size(); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( numPairsGpu.getBufferCL() ), - b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + + b3BufferInfoCL(numPairsGpu.getBufferCL()), + b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxPairs); launcher.setConst(numQueryAabbs); - + launcher.launch1D(numQueryAabbs); clFinish(m_queue); } - + int numLargeAabbRigids = m_largeAabbs.size(); - if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) + if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0) { B3_PROFILE("PLBVH large-small AABB test"); - + int numQueryAabbs = m_leafNodeAabbs.size(); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_largeAabbs.getBufferCL() ), - - b3BufferInfoCL( numPairsGpu.getBufferCL() ), - b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_largeAabbs.getBufferCL()), + + b3BufferInfoCL(numPairsGpu.getBufferCL()), + b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxPairs); launcher.setConst(numLargeAabbRigids); launcher.setConst(numQueryAabbs); - + launcher.launch1D(numQueryAabbs); clFinish(m_queue); } - - + // int numPairs = -1; numPairsGpu.copyToHostPointer(&numPairs, 1); - if(numPairs > maxPairs) + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); numPairs = maxPairs; numPairsGpu.copyFromHostPointer(&maxPairs, 1); } - + out_overlappingPairs.resize(numPairs); } - -void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, - b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) { B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); - + int numRays = rays.size(); int maxRayRigidPairs = out_rayRigidPairs.size(); - + int reset = 0; out_numRayRigidPairs.copyFromHostPointer(&reset, 1); - + // - if( m_leafNodeAabbs.size() > 0 ) + if (m_leafNodeAabbs.size() > 0) { B3_PROFILE("PLBVH ray test small AABB"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( rays.getBufferCL() ), - - b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), - b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + + b3BufferInfoCL(rays.getBufferCL()), + + b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), + b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxRayRigidPairs); launcher.setConst(numRays); - + launcher.launch1D(numRays); clFinish(m_queue); } - + int numLargeAabbRigids = m_largeAabbs.size(); - if(numLargeAabbRigids > 0) + if (numLargeAabbRigids > 0) { B3_PROFILE("PLBVH ray test large AABB"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_largeAabbs.getBufferCL() ), - b3BufferInfoCL( rays.getBufferCL() ), - - b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), - b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_largeAabbs.getBufferCL()), + b3BufferInfoCL(rays.getBufferCL()), + + b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), + b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLargeAabbRigids); launcher.setConst(maxRayRigidPairs); launcher.setConst(numRays); - + launcher.launch1D(numRays); clFinish(m_queue); } - + // int numRayRigidPairs = -1; out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); - - if(numRayRigidPairs > maxRayRigidPairs) + + if (numRayRigidPairs > maxRayRigidPairs) b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); - } void b3GpuParallelLinearBvh::constructBinaryRadixTree() { B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); - + int numLeaves = m_leafNodeAabbs.size(); int numInternalNodes = numLeaves - 1; - + //Each internal node is placed in between 2 leaf nodes. //By using this arrangement and computing the common prefix between //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree. { B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + b3BufferInfoCL(m_commonPrefixes.getBufferCL()), + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - - //For each leaf node, select its parent node by + + //For each leaf node, select its parent node by //comparing the 2 nearest internal nodes and assign child node indices { B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), - b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), + b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLeaves); - + launcher.launch1D(numLeaves); clFinish(m_queue); } - + //For each internal node, perform 2 binary searches among the other internal nodes //to its left and right to find its potential parent nodes and assign child node indices { B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_commonPrefixes.getBufferCL()), + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_rootNodeIndex.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - + //Find the number of nodes seperating each internal node and the root node //so that the AABBs can be set using the next kernel. //Also determine the maximum number of nodes separating an internal node and the root node. { B3_PROFILE("m_findDistanceFromRootKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), - b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()), + b3BufferInfoCL(m_distanceFromRoot.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - + //Starting from the internal nodes nearest to the leaf nodes, recursively move up //the tree towards the root to set the AABBs of each internal node; each internal node //checks its children and merges their AABBs { B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); - + int maxDistanceFromRoot = -1; { B3_PROFILE("copy maxDistanceFromRoot to CPU"); m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); clFinish(m_queue); } - - for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + + for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_distanceFromRoot.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxDistanceFromRoot); launcher.setConst(distanceFromRoot); launcher.setConst(numInternalNodes); - + //It may seem inefficent to launch a thread for each internal node when a //much smaller number of nodes is actually processed, but this is actually - //faster than determining the exact nodes that are ready to merge their child AABBs. + //faster than determining the exact nodes that are ready to merge their child AABBs. launcher.launch1D(numInternalNodes); } - + clFinish(m_queue); } } - -
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index effe617b7b..b390775129 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -37,10 +37,10 @@ subject to the following restrictions: ///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n ///@par ///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: -/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) /// - [fully parallel] Sort morton codes -/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) -/// - [somewhat parallel] Set internal node AABBs +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs ///@par ///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. ///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. @@ -49,75 +49,75 @@ subject to the following restrictions: class b3GpuParallelLinearBvh { cl_command_queue m_queue; - + cl_program m_parallelLinearBvhProgram; - + cl_kernel m_separateAabbsKernel; cl_kernel m_findAllNodesMergedAabbKernel; cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; - + //Binary radix tree construction kernels cl_kernel m_computeAdjacentPairCommonPrefixKernel; cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; cl_kernel m_findDistanceFromRootKernel; cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; - + cl_kernel m_findLeafIndexRangesKernel; - + //Traversal kernels cl_kernel m_plbvhCalculateOverlappingPairsKernel; cl_kernel m_plbvhRayTraverseKernel; cl_kernel m_plbvhLargeAabbAabbTestKernel; cl_kernel m_plbvhLargeAabbRayTestKernel; - + b3RadixSort32CL m_radixSorter; - + //1 element - b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node - b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node - b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() - + b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node + b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; - b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index - b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node - b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal - + b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index + b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + //1 element per internal node; for binary radix tree construction b3OpenCLArray<b3Int64> m_commonPrefixes; b3OpenCLArray<int> m_commonPrefixLengths; - b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root - + b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root + //1 element per leaf node (leaf nodes only include small AABBs) - b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal - b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs - b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes - b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs - + b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs + b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes + b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs + //1 element per large AABB, which is not stored in the BVH b3OpenCLArray<b3SapAabb> m_largeAabbs; - + public: b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); virtual ~b3GpuParallelLinearBvh(); - + ///Must be called before any other function - void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, - const b3OpenCLArray<int>& largeAabbIndices); - + void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices); + ///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs); - + ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. - void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, - b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); - + void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); + private: void constructBinaryRadixTree(); }; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp index d2618024ac..62ea7a32df 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -13,45 +13,44 @@ subject to the following restrictions: #include "b3GpuParallelLinearBvhBroadphase.h" -b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : - m_plbvh(context, device, queue), - - m_overlappingPairsGpu(context, queue), - - m_aabbsGpu(context, queue), - m_smallAabbsMappingGpu(context, queue), - m_largeAabbsMappingGpu(context, queue) +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + + m_aabbsGpu(context, queue), + m_smallAabbsMappingGpu(context, queue), + m_largeAabbsMappingGpu(context, queue) { } -void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int newAabbIndex = m_aabbsCpu.size(); b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; - + aabb.m_minIndices[3] = userPtr; aabb.m_signedMaxIndices[3] = newAabbIndex; - + m_smallAabbsMappingCpu.push_back(newAabbIndex); - + m_aabbsCpu.push_back(aabb); } -void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int newAabbIndex = m_aabbsCpu.size(); b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; - + aabb.m_minIndices[3] = userPtr; aabb.m_signedMaxIndices[3] = newAabbIndex; - + m_largeAabbsMappingCpu.push_back(newAabbIndex); - + m_aabbsCpu.push_back(aabb); } @@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) { //Reconstruct BVH m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); - + // m_overlappingPairsGpu.resize(maxPairs); m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu); } void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) { - b3Assert(0); //CPU version not implemented + b3Assert(0); //CPU version not implemented } -void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() -{ - m_aabbsGpu.copyFromHost(m_aabbsCpu); +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ + m_aabbsGpu.copyFromHost(m_aabbsCpu); m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu); m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu); } - - - diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index e518500637..dda0eea7be 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -21,42 +21,42 @@ subject to the following restrictions: class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface { b3GpuParallelLinearBvh m_plbvh; - + b3OpenCLArray<b3Int4> m_overlappingPairsGpu; - + b3OpenCLArray<b3SapAabb> m_aabbsGpu; b3OpenCLArray<int> m_smallAabbsMappingGpu; b3OpenCLArray<int> m_largeAabbsMappingGpu; - + b3AlignedObjectArray<b3SapAabb> m_aabbsCpu; b3AlignedObjectArray<int> m_smallAabbsMappingCpu; b3AlignedObjectArray<int> m_largeAabbsMappingCpu; - + public: b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); virtual ~b3GpuParallelLinearBvhBroadphase() {} - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void calculateOverlappingPairs(int maxPairs); virtual void calculateOverlappingPairsHost(int maxPairs); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - - virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } - + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; } virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; } virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } - + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; } - + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) { return new b3GpuParallelLinearBvhBroadphase(context, device, queue); diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index c45fbbdcaa..4126d03ed0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -6,7 +6,6 @@ bool searchIncremental3dSapOnGpu = true; #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "kernels/sapKernels.h" @@ -56,110 +55,105 @@ bool searchIncremental3dSapOnGpu = true; class b3PrefixScanFloat4CL* m_prefixScanFloat4; */ -b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType) -:m_context(ctx), -m_device(device), -m_queue(q), - -m_objectMinMaxIndexGPUaxis0(ctx,q), -m_objectMinMaxIndexGPUaxis1(ctx,q), -m_objectMinMaxIndexGPUaxis2(ctx,q), -m_objectMinMaxIndexGPUaxis0prev(ctx,q), -m_objectMinMaxIndexGPUaxis1prev(ctx,q), -m_objectMinMaxIndexGPUaxis2prev(ctx,q), -m_sortedAxisGPU0(ctx,q), -m_sortedAxisGPU1(ctx,q), -m_sortedAxisGPU2(ctx,q), -m_sortedAxisGPU0prev(ctx,q), -m_sortedAxisGPU1prev(ctx,q), -m_sortedAxisGPU2prev(ctx,q), -m_addedHostPairsGPU(ctx,q), -m_removedHostPairsGPU(ctx,q), -m_addedCountGPU(ctx,q), -m_removedCountGPU(ctx,q), -m_currentBuffer(-1), -m_pairCount(ctx,q), -m_allAabbsGPU(ctx,q), -m_sum(ctx,q), -m_sum2(ctx,q), -m_dst(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_overlappingPairs(ctx,q), -m_gpuSmallSortData(ctx,q), -m_gpuSmallSortedAabbs(ctx,q) +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType) + : m_context(ctx), + m_device(device), + m_queue(q), + + m_objectMinMaxIndexGPUaxis0(ctx, q), + m_objectMinMaxIndexGPUaxis1(ctx, q), + m_objectMinMaxIndexGPUaxis2(ctx, q), + m_objectMinMaxIndexGPUaxis0prev(ctx, q), + m_objectMinMaxIndexGPUaxis1prev(ctx, q), + m_objectMinMaxIndexGPUaxis2prev(ctx, q), + m_sortedAxisGPU0(ctx, q), + m_sortedAxisGPU1(ctx, q), + m_sortedAxisGPU2(ctx, q), + m_sortedAxisGPU0prev(ctx, q), + m_sortedAxisGPU1prev(ctx, q), + m_sortedAxisGPU2prev(ctx, q), + m_addedHostPairsGPU(ctx, q), + m_removedHostPairsGPU(ctx, q), + m_addedCountGPU(ctx, q), + m_removedCountGPU(ctx, q), + m_currentBuffer(-1), + m_pairCount(ctx, q), + m_allAabbsGPU(ctx, q), + m_sum(ctx, q), + m_sum2(ctx, q), + m_dst(ctx, q), + m_smallAabbsMappingGPU(ctx, q), + m_largeAabbsMappingGPU(ctx, q), + m_overlappingPairs(ctx, q), + m_gpuSmallSortData(ctx, q), + m_gpuSmallSortedAabbs(ctx, q) { const char* sapSrc = sapCL; - - - cl_int errNum=0; + + cl_int errNum = 0; b3Assert(m_context); b3Assert(m_device); - cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); - b3Assert(errNum==CL_SUCCESS); - + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); + b3Assert(errNum == CL_SUCCESS); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); #ifndef __APPLE__ - m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue); #else m_prefixScanFloat4 = 0; #endif m_sapKernel = 0; - + switch (kernelType) { case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: { - m_sapKernel=0; + m_sapKernel = 0; break; } - case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_ORIGINAL: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_BARRIER: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); break; } default: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); } }; - - - - m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); - m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - - m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg); - m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); - m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg); + + m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuSapBroadphase::~b3GpuSapBroadphase() @@ -173,13 +167,11 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase() clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); clReleaseKernel(m_prepareSumVarianceKernel); - - } /// conservative test for overlap between two aabbs -static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, - const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1, + const b3Vector3& aabbMin2, const b3Vector3& aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; @@ -188,8 +180,6 @@ static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aab return overlap; } - - //http://stereopsis.com/radix.html static unsigned int FloatFlip(float fl) { @@ -198,79 +188,77 @@ static unsigned int FloatFlip(float fl) return f ^ mask; }; -void b3GpuSapBroadphase::init3dSap() +void b3GpuSapBroadphase::init3dSap() { - if (m_currentBuffer<0) + if (m_currentBuffer < 0) { m_allAabbsGPU.copyToHost(m_allAabbsCPU); m_currentBuffer = 0; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { int totalNumAabbs = m_allAabbsCPU.size(); - int numEndPoints = 2*totalNumAabbs; + int numEndPoints = 2 * totalNumAabbs; m_sortedAxisCPU[axis][buf].resize(numEndPoints); - if (buf==m_currentBuffer) + if (buf == m_currentBuffer) { - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; - m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; - m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; - m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; + m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1; + m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1; } } } } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { //int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } - } } - static bool b3PairCmp(const b3Int4& p, const b3Int4& q) { - return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); + return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y))); } - -static bool operator==(const b3Int4& a,const b3Int4& b) +static bool operator==(const b3Int4& a, const b3Int4& b) { return a.x == b.x && a.y == b.y; }; -static bool operator<(const b3Int4& a,const b3Int4& b) +static bool operator<(const b3Int4& a, const b3Int4& b) { return a.x < b.x || (a.x == b.x && a.y < b.y); }; -static bool operator>(const b3Int4& a,const b3Int4& b) +static bool operator>(const b3Int4& a, const b3Int4& b) { return a.x > b.x || (a.x == b.x && a.y > b.y); }; @@ -278,31 +266,29 @@ static bool operator>(const b3Int4& a,const b3Int4& b) b3AlignedObjectArray<b3Int4> addedHostPairs; b3AlignedObjectArray<b3Int4> removedHostPairs; -b3AlignedObjectArray<b3SapAabb> preAabbs; +b3AlignedObjectArray<b3SapAabb> preAabbs; -void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { //static int framepje = 0; //printf("framepje=%d\n",framepje++); - B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); addedHostPairs.resize(0); removedHostPairs.resize(0); - b3Assert(m_currentBuffer>=0); - + b3Assert(m_currentBuffer >= 0); + { preAabbs.resize(m_allAabbsCPU.size()); - for (int i=0;i<preAabbs.size();i++) + for (int i = 0; i < preAabbs.size(); i++) { - preAabbs[i]=m_allAabbsCPU[i]; + preAabbs[i] = m_allAabbsCPU[i]; } } - - if (m_currentBuffer<0) + if (m_currentBuffer < 0) return; { B3_PROFILE("m_allAabbsGPU.copyToHost"); @@ -316,100 +302,87 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } if (0) { - { - printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", - m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], - m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); - } - - { - printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", - m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], - m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); - } - - - { - b3Int4 newPair; - newPair.x = 40; - newPair.y = 53; - int index = allPairs.findBinarySearch(newPair); - printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); - { - int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); - printf("overlap=%d\n",overlap); + printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", + m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2], + m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]); } - if (preAabbs.size()) - { - int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); - printf("prevoverlap=%d\n",prevOverlap); - } else { - printf("unknown prevoverlap\n"); + printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", + m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2], + m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]); } - } - } + { + b3Int4 newPair; + newPair.x = 40; + newPair.y = 53; + int index = allPairs.findBinarySearch(newPair); + printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size()); + { + int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max); + printf("overlap=%d\n", overlap); + } + + if (preAabbs.size()) + { + int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max); + printf("prevoverlap=%d\n", prevOverlap); + } + else + { + printf("unknown prevoverlap\n"); + } + } + } if (0) { - for (int i=0;i<m_allAabbsCPU.size();i++) + for (int i = 0; i < m_allAabbsCPU.size(); i++) { //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); - - } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { - b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); + b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2); } } } - - - m_currentBuffer = 1-m_currentBuffer; - - + m_currentBuffer = 1 - m_currentBuffer; int totalNumAabbs = m_allAabbsCPU.size(); { B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - - unsigned int keyMin[3]; unsigned int keyMax[3]; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - float vmin=m_allAabbsCPU[i].m_min[axis]; + float vmin = m_allAabbsCPU[i].m_min[axis]; float vmax = m_allAabbsCPU[i].m_max[axis]; keyMin[axis] = FloatFlip(vmin); keyMax[axis] = FloatFlip(vmax); - - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; - m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; - m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1; } //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); - } } - - { B3_PROFILE("sort m_sortedAxisCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } @@ -432,21 +405,22 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { B3_PROFILE("assign m_objectMinMaxIndexCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } @@ -485,12 +459,11 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } #endif - int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size(); int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); - b3Assert(a==b); - b3Assert(b==c); + b3Assert(a == b); + b3Assert(b == c); /* if (searchIncremental3dSapOnGpu) { @@ -574,175 +547,170 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); B3_PROFILE("actual search"); - for (int i=0;i<numObjects;i++) + for (int i = 0; i < numObjects; i++) { //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size(); //int checkObjects[]={40,53}; //int numCheckObjects = sizeof(checkObjects)/sizeof(int); - + //for (int a=0;a<numCheckObjects ;a++) - - for (int axis=0;axis<3;axis++) + + for (int axis = 0; axis < 3; axis++) { //int i = checkObjects[a]; unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x; unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; - unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; + unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x; int dmin = curMinIndex - prevMinIndex; - - unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; - + unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y; int dmax = curMaxIndex - prevMaxIndex; - if (dmin!=0) + if (dmin != 0) { //printf("for object %d, dmin=%d\n",i,dmin); } - if (dmax!=0) + if (dmax != 0) { //printf("for object %d, dmax=%d\n",i,dmax); } - for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) + for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++) { - if (dmin!=0) + if (dmin != 0) { - int stepMin = dmin<0 ? -1 : 1; - for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) + int stepMin = dmin < 0 ? -1 : 1; + for (int j = prevMinIndex; j != curMinIndex; j += stepMin) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { - bool otherIsMax = ((otherIndex2&1)!=0); + bool otherIsMax = ((otherIndex2 & 1) != 0); if (otherIsMax) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } - // b3Assert(overlap2==overlap); + // b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - //b3Assert(overlap==overlap2); - - - if (dmin<0) + if (dmin < 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); } - } + } else { if (!overlap && prevOverlap) { - //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); } - }//otherisMax - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + } //otherisMax + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - - if (dmax!=0) + + if (dmax != 0) { - int stepMax = dmax<0 ? -1 : 1; - for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) + int stepMax = dmax < 0 ? -1 : 1; + for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { //bool otherIsMin = ((otherIndex2&1)==0); //if (otherIsMin) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } //b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - - if (dmax>0) + if (dmax > 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); - } - } + } else { if (!overlap && prevOverlap) @@ -750,33 +718,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() //if (otherIndex2&1==0) -> min? //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); - } } - - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - }//for (int otherbuffer - }//for (int axis=0; - }//for (int i=0;i<numObjects + } //for (int otherbuffer + } //for (int axis=0; + } //for (int i=0;i<numObjects } //remove duplicates and add/remove then to existing m_overlappingPairs - - - + { { B3_PROFILE("sort allPairs"); @@ -795,31 +761,28 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() b3Int4 prevPair; prevPair.x = -1; prevPair.y = -1; - + int uniqueRemovedPairs = 0; b3AlignedObjectArray<int> removedPositions; { B3_PROFILE("actual removing"); - for (int i=0;i<removedHostPairs.size();i++) + for (int i = 0; i < removedHostPairs.size(); i++) { b3Int4 removedPair = removedHostPairs[i]; if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) { + int index1 = allPairs.findBinarySearch(removedPair); - int index1 = allPairs.findBinarySearch(removedPair); + //#ifdef _DEBUG - //#ifdef _DEBUG - - - int index2 = allPairs.findLinearSearch(removedPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); + //b3Assert(index1!=allPairs.size()); - if (index1<allPairs.size()) - //#endif//_DEBUG + if (index1 < allPairs.size()) + //#endif//_DEBUG { uniqueRemovedPairs++; removedPositions.push_back(index1); @@ -833,13 +796,13 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() if (uniqueRemovedPairs) { - for (int i=0;i<removedPositions.size();i++) + for (int i = 0; i < removedPositions.size(); i++) { - allPairs[removedPositions[i]].x = INT_MAX ; - allPairs[removedPositions[i]].y = INT_MAX ; + allPairs[removedPositions[i]].x = INT_MAX; + allPairs[removedPositions[i]].y = INT_MAX; } allPairs.quickSort(b3PairCmp); - allPairs.resize(allPairs.size()-uniqueRemovedPairs); + allPairs.resize(allPairs.size() - uniqueRemovedPairs); } } //if (uniqueRemovedPairs) @@ -848,33 +811,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() prevPair.x = -1; prevPair.y = -1; - - int uniqueAddedPairs=0; + + int uniqueAddedPairs = 0; b3AlignedObjectArray<b3Int4> actualAddedPairs; { B3_PROFILE("actual adding"); - for (int i=0;i<addedHostPairs.size();i++) + for (int i = 0; i < addedHostPairs.size(); i++) { b3Int4 newPair = addedHostPairs[i]; if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) { -//#ifdef _DEBUG + //#ifdef _DEBUG int index1 = allPairs.findBinarySearch(newPair); - - + int index2 = allPairs.findLinearSearch(newPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); - b3Assert(index1==allPairs.size()); - if (index1!=allPairs.size()) + b3Assert(index1 == allPairs.size()); + if (index1 != allPairs.size()) { printf("??\n"); } - if (index1==allPairs.size()) -//#endif //_DEBUG + if (index1 == allPairs.size()) + //#endif //_DEBUG { uniqueAddedPairs++; actualAddedPairs.push_back(newPair); @@ -882,94 +843,83 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } prevPair = newPair; } - for (int i=0;i<actualAddedPairs.size();i++) + for (int i = 0; i < actualAddedPairs.size(); i++) { //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); allPairs.push_back(actualAddedPairs[i]); } } - + //if (uniqueAddedPairs) // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); - { B3_PROFILE("m_overlappingPairs.copyFromHost"); m_overlappingPairs.copyFromHost(allPairs); } - - } - - - -void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) { //test -// if (m_currentBuffer>=0) + // if (m_currentBuffer>=0) // return calculateOverlappingPairsHostIncremental3Sap(); b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); m_allAabbsGPU.copyToHost(m_allAabbsCPU); - - - int axis=0; + int axis = 0; { B3_PROFILE("CPU compute best variance axis"); - b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); + b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0); int numRigidBodies = m_smallAabbsMappingCPU.size(); - for(int i=0;i<numRigidBodies;i++) + for (int i = 0; i < numRigidBodies; i++) { b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; - b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); - b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); - b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; - + b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]); + b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]); + b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f; + s += centerAabb; - s2 += centerAabb*centerAabb; + s2 += centerAabb * centerAabb; } - b3Vector3 v = s2 - (s*s) / (float)numRigidBodies; - - if(v[1] > v[0]) + b3Vector3 v = s2 - (s * s) / (float)numRigidBodies; + + if (v[1] > v[0]) axis = 1; - if(v[2] > v[axis]) + if (v[2] > v[axis]) axis = 2; } - - - b3AlignedObjectArray<b3Int4> hostPairs; { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; - for (int j=i+1;j<numSmallAabbs;j++) + for (int j = i + 1; j < numSmallAabbs; j++) { - b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) + (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max)) { b3Int4 pair; int a = smallAabbi.m_minIndices[3]; int b = smallAabbj.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a;//store the original index in the unsorted aabb array + pair.x = a; //store the original index in the unsorted aabb array pair.y = b; - } else + } + else { - pair.x = b;//store the original index in the unsorted aabb array + pair.x = b; //store the original index in the unsorted aabb array pair.y = a; } hostPairs.push_back(pair); @@ -978,35 +928,35 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) } } - { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; int numLargeAabbs = m_largeAabbsMappingCPU.size(); - for (int j=0;j<numLargeAabbs;j++) + for (int j = 0; j < numLargeAabbs; j++) { b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) + (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max)) { b3Int4 pair; int a = largeAabbj.m_minIndices[3]; int b = smallAabbi.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a; - pair.y = b;//store the original index in the unsorted aabb array - } else + pair.x = a; + pair.y = b; //store the original index in the unsorted aabb array + } + else { pair.x = b; - pair.y = a;//store the original index in the unsorted aabb array + pair.y = a; //store the original index in the unsorted aabb array } - + hostPairs.push_back(pair); } } @@ -1021,21 +971,20 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) if (hostPairs.size()) { m_overlappingPairs.copyFromHost(hostPairs); - } else + } + else { m_overlappingPairs.resize(0); } //init3dSap(); - } -void b3GpuSapBroadphase::reset() +void b3GpuSapBroadphase::reset() { m_allAabbsGPU.resize(0); m_allAabbsCPU.resize(0); - m_smallAabbsMappingGPU.resize(0); m_smallAabbsMappingCPU.resize(0); @@ -1043,13 +992,11 @@ void b3GpuSapBroadphase::reset() m_largeAabbsMappingGPU.resize(0); m_largeAabbsMappingCPU.resize(0); - } - -void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) { - if (m_sapKernel==0) + if (m_sapKernel == 0) { calculateOverlappingPairsHost(maxPairs); return; @@ -1065,68 +1012,62 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) int axis = 0; { + //bool syncOnHost = false; - //bool syncOnHost = false; - - int numSmallAabbs = m_smallAabbsMappingCPU.size(); - if (m_prefixScanFloat4 && numSmallAabbs) - { - B3_PROFILE("GPU compute best variance axis"); - - if (m_dst.size()!=(numSmallAabbs+1)) + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + if (m_prefixScanFloat4 && numSmallAabbs) { - m_dst.resize(numSmallAabbs+128); - m_sum.resize(numSmallAabbs+128); - m_sum2.resize(numSmallAabbs+128); - m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? - m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? - } + B3_PROFILE("GPU compute best variance axis"); - b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); - launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - - launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); - launcher.setBuffer(m_sum.getBufferCL()); - launcher.setBuffer(m_sum2.getBufferCL()); - launcher.setConst( numSmallAabbs ); - int num = numSmallAabbs; - launcher.launch1D( num); - + if (m_dst.size() != (numSmallAabbs + 1)) + { + m_dst.resize(numSmallAabbs + 128); + m_sum.resize(numSmallAabbs + 128); + m_sum2.resize(numSmallAabbs + 128); + m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + } - b3Vector3 s; - b3Vector3 s2; - m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); - m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel"); + launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; - - if(v[1] > v[0]) - axis = 1; - if(v[2] > v[axis]) - axis = 2; - } + launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launcher.setBuffer(m_sum.getBufferCL()); + launcher.setBuffer(m_sum2.getBufferCL()); + launcher.setConst(numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D(num); + b3Vector3 s; + b3Vector3 s2; + m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s); + m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2); + + b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs; + + if (v[1] > v[0]) + axis = 1; + if (v[2] > v[axis]) + axis = 2; + } - m_gpuSmallSortData.resize(numSmallAabbs); - #if 1 if (m_smallAabbsMappingGPU.size()) { - B3_PROFILE("flipFloatKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); } @@ -1141,69 +1082,66 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) if (numSmallAabbs) { B3_PROFILE("scatterKernel"); - - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true), b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs); + b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel "); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); - } - - m_overlappingPairs.resize(maxPairs); + m_overlappingPairs.resize(maxPairs); - m_pairCount.resize(0); - m_pairCount.push_back(0); - int numPairs=0; + m_pairCount.resize(0); + m_pairCount.push_back(0); + int numPairs = 0; + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) { - int numLargeAabbs = m_largeAabbsMappingGPU.size(); - if (numLargeAabbs && numSmallAabbs) + //@todo + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL()), + b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_overlappingPairs.getBufferCL()), + b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numLargeAabbs); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) { - //@todo - B3_PROFILE("sap2Kernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), - b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), - b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numLargeAabbs ); - launcher.setConst( numSmallAabbs); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); -//@todo: use actual maximum work item sizes of the device instead of hardcoded values - launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); - - numPairs = m_pairCount.at(0); - if (numPairs >maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; - } + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; } } - if (m_gpuSmallSortedAabbs.size()) - { - B3_PROFILE("sapKernel"); - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); - - - int num = numSmallAabbs; + } + if (m_gpuSmallSortedAabbs.size()) + { + B3_PROFILE("sapKernel"); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + + int num = numSmallAabbs; #if 0 int buffSize = launcher.getSerializationBufferSize(); unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; @@ -1225,73 +1163,71 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) FILE* f = fopen("m_sapKernelArgs.bin","wb"); fwrite(buf,buffSize+sizeof(int),1,f); fclose(f); -#endif// +#endif // - launcher.launch1D( num); - clFinish(m_queue); - - numPairs = m_pairCount.at(0); - if (numPairs>maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs = maxPairs; - m_pairCount.resize(0); - m_pairCount.push_back(maxPairs); - } + launcher.launch1D(num); + clFinish(m_queue); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + m_pairCount.resize(0); + m_pairCount.push_back(maxPairs); } - + } + #else - int numPairs = 0; - - - b3LauncherCL launcher(m_queue, m_sapKernel); - - const char* fileName = "m_sapKernelArgs.bin"; - FILE* f = fopen(fileName,"rb"); - if (f) - { - int sizeInBytes=0; - if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) - { - printf("error, cannot get file size\n"); - exit(0); - } - - unsigned char* buf = (unsigned char*) malloc(sizeInBytes); - fread(buf,sizeInBytes,1,f); - int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); - int num = *(int*)&buf[serializedBytes]; - launcher.launch1D( num); - - b3OpenCLArray<int> pairCount(m_context, m_queue); - int numElements = launcher.m_arrays[2]->size()/sizeof(int); - pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); - numPairs = pairCount.at(0); - //printf("overlapping pairs = %d\n",numPairs); - b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; - b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); - tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); - - tmpGpuPairs.copyToHost(hostOoverlappingPairs); - m_overlappingPairs.copyFromHost(hostOoverlappingPairs); - //printf("hello %d\n", m_overlappingPairs.size()); - free(buf); - fclose(f); - - } else { - printf("error: cannot find file %s\n",fileName); - } - - clFinish(m_queue); - - + int numPairs = 0; + + b3LauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName, "rb"); + if (f) + { + int sizeInBytes = 0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*)malloc(sizeInBytes); + fread(buf, sizeInBytes, 1, f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D(num); + + b3OpenCLArray<int> pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size() / sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; + b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + } + else + { + printf("error: cannot find file %s\n", fileName); + } + + clFinish(m_queue); + #endif - - m_overlappingPairs.resize(numPairs); - - }//B3_PROFILE("GPU_RADIX SORT"); - //init3dSap(); + m_overlappingPairs.resize(numPairs); + + } //B3_PROFILE("GPU_RADIX SORT"); + //init3dSap(); } void b3GpuSapBroadphase::writeAabbsToGpu() @@ -1299,17 +1235,14 @@ void b3GpuSapBroadphase::writeAabbsToGpu() m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); - m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this - - - + m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this } -void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1317,15 +1250,15 @@ void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vec aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - + m_allAabbsCPU.push_back(aabb); } -void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1334,20 +1267,19 @@ void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - m_allAabbsCPU.push_back(aabb); } -cl_mem b3GpuSapBroadphase::getAabbBufferWS() +cl_mem b3GpuSapBroadphase::getAabbBufferWS() { return m_allAabbsGPU.getBufferCL(); } -int b3GpuSapBroadphase::getNumOverlap() +int b3GpuSapBroadphase::getNumOverlap() { return m_overlappingPairs.size(); } -cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); } diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h index 8d36ac78f2..d17590b14a 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -2,7 +2,7 @@ #define B3_GPU_SAP_BROADPHASE_H #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" -#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 class b3Vector3; #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -11,141 +11,133 @@ class b3Vector3; #include "b3GpuBroadphaseInterface.h" - class b3GpuSapBroadphase : public b3GpuBroadphaseInterface { - - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; - cl_kernel m_flipFloatKernel; - cl_kernel m_scatterKernel ; - cl_kernel m_copyAabbsKernel; - cl_kernel m_sapKernel; - cl_kernel m_sap2Kernel; - cl_kernel m_prepareSumVarianceKernel; - + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_flipFloatKernel; + cl_kernel m_scatterKernel; + cl_kernel m_copyAabbsKernel; + cl_kernel m_sapKernel; + cl_kernel m_sap2Kernel; + cl_kernel m_prepareSumVarianceKernel; class b3RadixSort32CL* m_sorter; ///test for 3d SAP - b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; - b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; - - b3OpenCLArray<b3SortData> m_sortedAxisGPU0; - b3OpenCLArray<b3SortData> m_sortedAxisGPU1; - b3OpenCLArray<b3SortData> m_sortedAxisGPU2; - b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; - b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; - b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; - - - b3OpenCLArray<b3Int4> m_addedHostPairsGPU; - b3OpenCLArray<b3Int4> m_removedHostPairsGPU; - b3OpenCLArray<int> m_addedCountGPU; - b3OpenCLArray<int> m_removedCountGPU; - - int m_currentBuffer; + b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; + b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; + + b3OpenCLArray<b3SortData> m_sortedAxisGPU0; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2; + b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; + + b3OpenCLArray<b3Int4> m_addedHostPairsGPU; + b3OpenCLArray<b3Int4> m_removedHostPairsGPU; + b3OpenCLArray<int> m_addedCountGPU; + b3OpenCLArray<int> m_removedCountGPU; + + int m_currentBuffer; public: - b3OpenCLArray<int> m_pairCount; + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; - b3OpenCLArray<b3SapAabb> m_allAabbsGPU; - b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; - - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_allAabbsGPU; } - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_allAabbsCPU; } - b3OpenCLArray<b3Vector3> m_sum; - b3OpenCLArray<b3Vector3> m_sum2; - b3OpenCLArray<b3Vector3> m_dst; + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; - b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3OpenCLArray<int> m_smallAabbsMappingGPU; b3AlignedObjectArray<int> m_smallAabbsMappingCPU; - b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3OpenCLArray<int> m_largeAabbsMappingGPU; b3AlignedObjectArray<int> m_largeAabbsMappingCPU; - - b3OpenCLArray<b3Int4> m_overlappingPairs; + b3OpenCLArray<b3Int4> m_overlappingPairs; //temporary gpu work memory - b3OpenCLArray<b3SortData> m_gpuSmallSortData; - b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; - class b3PrefixScanFloat4CL* m_prefixScanFloat4; + class b3PrefixScanFloat4CL* m_prefixScanFloat4; enum b3GpuSapKernelType { - B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1, + B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU, B3_GPU_SAP_KERNEL_ORIGINAL, B3_GPU_SAP_KERNEL_BARRIER, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY }; - b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); virtual ~b3GpuSapBroadphase(); - - static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q) + + static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); } - static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); } - static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL); } - static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER); } - static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); } - - virtual void calculateOverlappingPairs(int maxPairs); - virtual void calculateOverlappingPairsHost(int maxPairs); - - void reset(); + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + void reset(); void init3dSap(); virtual void calculateOverlappingPairsHostIncremental3Sap(); - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - virtual cl_mem getAabbBufferWS(); - virtual int getNumOverlap(); - virtual cl_mem getOverlappingPairBuffer(); - + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); }; -#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file +#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h index ea6550fede..60570f2605 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h @@ -5,10 +5,9 @@ #include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" ///just make sure that the b3Aabb is 16-byte aligned -B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb -{ +B3_ATTRIBUTE_ALIGNED16(struct) +b3SapAabb : public b3Aabb{ -}; + }; - -#endif //B3_SAP_AABB_H +#endif //B3_SAP_AABB_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h index dad42477c3..0185417786 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -1,199 +1,198 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* gridBroadphaseCL= \ -"int getPosHash(int4 gridPos, __global float4* pParams)\n" -"{\n" -" int4 gridDim = *((__global int4*)(pParams + 1));\n" -" gridPos.x &= gridDim.x - 1;\n" -" gridPos.y &= gridDim.y - 1;\n" -" gridPos.z &= gridDim.z - 1;\n" -" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" -" return hash;\n" -"} \n" -"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" -"{\n" -" int4 gridPos;\n" -" int4 gridDim = *((__global int4*)(pParams + 1));\n" -" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" -" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" -" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" -" return gridPos;\n" -"}\n" -"// calculate grid hash value for each body using its AABB\n" -"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numObjects)\n" -" {\n" -" return;\n" -" }\n" -" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" -" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" -" float4 pos;\n" -" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -" pos.w = 0.f;\n" -" // get address in grid\n" -" int4 gridPos = getGridPos(pos, pParams);\n" -" int gridHash = getPosHash(gridPos, pParams);\n" -" // store grid hash and body index\n" -" int2 hashVal;\n" -" hashVal.x = gridHash;\n" -" hashVal.y = index;\n" -" pHash[index] = hashVal;\n" -"}\n" -"__kernel void kClearCellStart( int numCells, \n" -" __global int* pCellStart )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numCells)\n" -" {\n" -" return;\n" -" }\n" -" pCellStart[index] = -1;\n" -"}\n" -"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" -"{\n" -" __local int sharedHash[513];\n" -" int index = get_global_id(0);\n" -" int2 sortedData;\n" -" if(index < numObjects)\n" -" {\n" -" sortedData = pHash[index];\n" -" // Load hash data into shared memory so that we can look \n" -" // at neighboring body's hash value without loading\n" -" // two hash values per thread\n" -" sharedHash[get_local_id(0) + 1] = sortedData.x;\n" -" if((index > 0) && (get_local_id(0) == 0))\n" -" {\n" -" // first thread in block must load neighbor body hash\n" -" sharedHash[0] = pHash[index-1].x;\n" -" }\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" if(index < numObjects)\n" -" {\n" -" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" -" {\n" -" cellStart[sortedData.x] = index;\n" -" }\n" -" }\n" -"}\n" -"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" -"{\n" -" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" -" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" -" (min0.z <= max1.z)&& (min1.z <= max0.z); \n" -"}\n" -"//search for AABB 'index' against other AABBs' in this cell\n" -"void findPairsInCell( int numObjects,\n" -" int4 gridPos,\n" -" int index,\n" -" __global int2* pHash,\n" -" __global int* pCellStart,\n" -" __global float4* allpAABB, \n" -" __global const int* smallAabbMapping,\n" -" __global float4* pParams,\n" -" volatile __global int* pairCount,\n" -" __global int4* pPairBuff2,\n" -" int maxPairs\n" -" )\n" -"{\n" -" int4 pGridDim = *((__global int4*)(pParams + 1));\n" -" int maxBodiesPerCell = pGridDim.w;\n" -" int gridHash = getPosHash(gridPos, pParams);\n" -" // get start of bucket for this cell\n" -" int bucketStart = pCellStart[gridHash];\n" -" if (bucketStart == -1)\n" -" {\n" -" return; // cell empty\n" -" }\n" -" // iterate over bodies in this cell\n" -" int2 sortedData = pHash[index];\n" -" int unsorted_indx = sortedData.y;\n" -" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" -" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -" int handleIndex = as_int(min0.w);\n" -" \n" -" int bucketEnd = bucketStart + maxBodiesPerCell;\n" -" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" -" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" -" {\n" -" int2 cellData = pHash[index2];\n" -" if (cellData.x != gridHash)\n" -" {\n" -" break; // no longer in same bucket\n" -" }\n" -" int unsorted_indx2 = cellData.y;\n" -" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" -" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" -" { \n" -" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" -" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" -" if(testAABBOverlap(min0, max0, min1, max1))\n" -" {\n" -" if (pairCount)\n" -" {\n" -" int handleIndex2 = as_int(min1.w);\n" -" if (handleIndex<handleIndex2)\n" -" {\n" -" int curPair = atomic_add(pairCount,1);\n" -" if (curPair<maxPairs)\n" -" {\n" -" int4 newpair;\n" -" newpair.x = handleIndex;\n" -" newpair.y = handleIndex2;\n" -" newpair.z = -1;\n" -" newpair.w = -1;\n" -" pPairBuff2[curPair] = newpair;\n" -" }\n" -" }\n" -" \n" -" }\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void kFindOverlappingPairs( int numObjects,\n" -" __global float4* allpAABB, \n" -" __global const int* smallAabbMapping,\n" -" __global int2* pHash, \n" -" __global int* pCellStart, \n" -" __global float4* pParams ,\n" -" volatile __global int* pairCount,\n" -" __global int4* pPairBuff2,\n" -" int maxPairs\n" -" )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numObjects)\n" -" {\n" -" return;\n" -" }\n" -" int2 sortedData = pHash[index];\n" -" int unsorted_indx = sortedData.y;\n" -" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" -" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -" float4 pos;\n" -" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -" // get address in grid\n" -" int4 gridPosA = getGridPos(pos, pParams);\n" -" int4 gridPosB; \n" -" // examine only neighbouring cells\n" -" for(int z=-1; z<=1; z++) \n" -" {\n" -" gridPosB.z = gridPosA.z + z;\n" -" for(int y=-1; y<=1; y++) \n" -" {\n" -" gridPosB.y = gridPosA.y + y;\n" -" for(int x=-1; x<=1; x++) \n" -" {\n" -" gridPosB.x = gridPosA.x + x;\n" -" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" -" }\n" -" }\n" -" }\n" -"}\n" -; +static const char* gridBroadphaseCL = + "int getPosHash(int4 gridPos, __global float4* pParams)\n" + "{\n" + " int4 gridDim = *((__global int4*)(pParams + 1));\n" + " gridPos.x &= gridDim.x - 1;\n" + " gridPos.y &= gridDim.y - 1;\n" + " gridPos.z &= gridDim.z - 1;\n" + " int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" + " return hash;\n" + "} \n" + "int4 getGridPos(float4 worldPos, __global float4* pParams)\n" + "{\n" + " int4 gridPos;\n" + " int4 gridDim = *((__global int4*)(pParams + 1));\n" + " gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" + " gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" + " gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" + " return gridPos;\n" + "}\n" + "// calculate grid hash value for each body using its AABB\n" + "__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numObjects)\n" + " {\n" + " return;\n" + " }\n" + " float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" + " float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" + " float4 pos;\n" + " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" + " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" + " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" + " pos.w = 0.f;\n" + " // get address in grid\n" + " int4 gridPos = getGridPos(pos, pParams);\n" + " int gridHash = getPosHash(gridPos, pParams);\n" + " // store grid hash and body index\n" + " int2 hashVal;\n" + " hashVal.x = gridHash;\n" + " hashVal.y = index;\n" + " pHash[index] = hashVal;\n" + "}\n" + "__kernel void kClearCellStart( int numCells, \n" + " __global int* pCellStart )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numCells)\n" + " {\n" + " return;\n" + " }\n" + " pCellStart[index] = -1;\n" + "}\n" + "__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" + "{\n" + " __local int sharedHash[513];\n" + " int index = get_global_id(0);\n" + " int2 sortedData;\n" + " if(index < numObjects)\n" + " {\n" + " sortedData = pHash[index];\n" + " // Load hash data into shared memory so that we can look \n" + " // at neighboring body's hash value without loading\n" + " // two hash values per thread\n" + " sharedHash[get_local_id(0) + 1] = sortedData.x;\n" + " if((index > 0) && (get_local_id(0) == 0))\n" + " {\n" + " // first thread in block must load neighbor body hash\n" + " sharedHash[0] = pHash[index-1].x;\n" + " }\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " if(index < numObjects)\n" + " {\n" + " if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" + " {\n" + " cellStart[sortedData.x] = index;\n" + " }\n" + " }\n" + "}\n" + "int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" + "{\n" + " return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" + " (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" + " (min0.z <= max1.z)&& (min1.z <= max0.z); \n" + "}\n" + "//search for AABB 'index' against other AABBs' in this cell\n" + "void findPairsInCell( int numObjects,\n" + " int4 gridPos,\n" + " int index,\n" + " __global int2* pHash,\n" + " __global int* pCellStart,\n" + " __global float4* allpAABB, \n" + " __global const int* smallAabbMapping,\n" + " __global float4* pParams,\n" + " volatile __global int* pairCount,\n" + " __global int4* pPairBuff2,\n" + " int maxPairs\n" + " )\n" + "{\n" + " int4 pGridDim = *((__global int4*)(pParams + 1));\n" + " int maxBodiesPerCell = pGridDim.w;\n" + " int gridHash = getPosHash(gridPos, pParams);\n" + " // get start of bucket for this cell\n" + " int bucketStart = pCellStart[gridHash];\n" + " if (bucketStart == -1)\n" + " {\n" + " return; // cell empty\n" + " }\n" + " // iterate over bodies in this cell\n" + " int2 sortedData = pHash[index];\n" + " int unsorted_indx = sortedData.y;\n" + " float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" + " float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" + " int handleIndex = as_int(min0.w);\n" + " \n" + " int bucketEnd = bucketStart + maxBodiesPerCell;\n" + " bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" + " for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" + " {\n" + " int2 cellData = pHash[index2];\n" + " if (cellData.x != gridHash)\n" + " {\n" + " break; // no longer in same bucket\n" + " }\n" + " int unsorted_indx2 = cellData.y;\n" + " //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" + " if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" + " { \n" + " float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" + " float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" + " if(testAABBOverlap(min0, max0, min1, max1))\n" + " {\n" + " if (pairCount)\n" + " {\n" + " int handleIndex2 = as_int(min1.w);\n" + " if (handleIndex<handleIndex2)\n" + " {\n" + " int curPair = atomic_add(pairCount,1);\n" + " if (curPair<maxPairs)\n" + " {\n" + " int4 newpair;\n" + " newpair.x = handleIndex;\n" + " newpair.y = handleIndex2;\n" + " newpair.z = -1;\n" + " newpair.w = -1;\n" + " pPairBuff2[curPair] = newpair;\n" + " }\n" + " }\n" + " \n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void kFindOverlappingPairs( int numObjects,\n" + " __global float4* allpAABB, \n" + " __global const int* smallAabbMapping,\n" + " __global int2* pHash, \n" + " __global int* pCellStart, \n" + " __global float4* pParams ,\n" + " volatile __global int* pairCount,\n" + " __global int4* pPairBuff2,\n" + " int maxPairs\n" + " )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numObjects)\n" + " {\n" + " return;\n" + " }\n" + " int2 sortedData = pHash[index];\n" + " int unsorted_indx = sortedData.y;\n" + " float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" + " float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" + " float4 pos;\n" + " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" + " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" + " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" + " // get address in grid\n" + " int4 gridPosA = getGridPos(pos, pParams);\n" + " int4 gridPosB; \n" + " // examine only neighbouring cells\n" + " for(int z=-1; z<=1; z++) \n" + " {\n" + " gridPosB.z = gridPosA.z + z;\n" + " for(int y=-1; y<=1; y++) \n" + " {\n" + " gridPosB.y = gridPosA.y + y;\n" + " for(int x=-1; x<=1; x++) \n" + " {\n" + " gridPosB.x = gridPosA.x + x;\n" + " findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" + " }\n" + " }\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index 5eb8f45b16..c02877dde9 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -1,729 +1,728 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* parallelLinearBvhCL= \ -"/*\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose,\n" -"including commercial applications, and to alter it and redistribute it freely,\n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Initial Author Jackson Lee, 2014\n" -"typedef float b3Scalar;\n" -"typedef float4 b3Vector3;\n" -"#define b3Max max\n" -"#define b3Min min\n" -"#define b3Sqrt sqrt\n" -"typedef struct\n" -"{\n" -" unsigned int m_key;\n" -" unsigned int m_value;\n" -"} SortDataCL;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} b3AabbCL;\n" -"unsigned int interleaveBits(unsigned int x)\n" -"{\n" -" //........ ........ ......12 3456789A //x\n" -" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" -" \n" -" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" -" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" -" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" -" \n" -" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" -" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" -" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" -" \n" -" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" -" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" -" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" -" \n" -" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" -" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" -" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" -" \n" -" //........ ........ ......11 11111111 //0x000003FF\n" -" x &= 0x000003FF; //Clear all bits above bit 10\n" -" \n" -" x = (x ^ (x << 16)) & 0xFF0000FF;\n" -" x = (x ^ (x << 8)) & 0x0300F00F;\n" -" x = (x ^ (x << 4)) & 0x030C30C3;\n" -" x = (x ^ (x << 2)) & 0x09249249;\n" -" \n" -" return x;\n" -"}\n" -"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" -"{\n" -" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" -"}\n" -"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" -"{\n" -" int separatedAabbIndex = get_global_id(0);\n" -" if(separatedAabbIndex >= numAabbsToSeparate) return;\n" -" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" -" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" -"}\n" -"//Should replace with an optimized parallel reduction\n" -"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" -"{\n" -" //Each time this kernel is added to the command queue, \n" -" //the number of AABBs needing to be merged is halved\n" -" //\n" -" //Example with 159 AABBs:\n" -" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" -" // numMergedAabbs == 159 - 80 == 79\n" -" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" -" \n" -" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" -" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" -" \n" -" int aabbIndex = get_global_id(0);\n" -" if(aabbIndex >= numMergedAabbs) return;\n" -" \n" -" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" -" \n" -" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" -" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" -" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" -" out_mergedAabb[aabbIndex] = mergedAabb;\n" -"}\n" -"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" -" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" -"{\n" -" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" -" if(leafNodeIndex >= numAabbs) return;\n" -" \n" -" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" -" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" -" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" -" \n" -" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" -" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" -" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" -" \n" -" //Quantize into integer coordinates\n" -" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" -" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" -" \n" -" int4 discretePosition;\n" -" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" -" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" -" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" -" \n" -" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" -" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" -" discretePosition += 512;\n" -" \n" -" //Interleave bits(assign a morton code, also known as a z-curve)\n" -" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" -" \n" -" //\n" -" SortDataCL mortonCodeIndexPair;\n" -" mortonCodeIndexPair.m_key = mortonCode;\n" -" mortonCodeIndexPair.m_value = leafNodeIndex;\n" -" \n" -" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" -"}\n" -"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" -"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" -"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" -"//In both cases, the bit should be cleared to access the actual node index.\n" -"int isLeafNode(int index) { return (index >> 31 == 0); }\n" -"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" -"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" -"//From sap.cl\n" -"#define NEW_PAIR_MARKER -1\n" -"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"//From sap.cl\n" -"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" -" __global int* rootNodeIndex, \n" -" __global int2* internalNodeChildIndices, \n" -" __global b3AabbCL* internalNodeAabbs,\n" -" __global int2* internalNodeLeafIndexRanges,\n" -" \n" -" __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" -" int maxPairs, int numQueryAabbs)\n" -"{\n" -" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" -" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" -" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -" if(queryBvhNodeIndex >= numQueryAabbs) return;\n" -" \n" -" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" -" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" -" \n" -" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -" \n" -" int stackSize = 1;\n" -" stack[0] = *rootNodeIndex;\n" -" \n" -" while(stackSize)\n" -" {\n" -" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -" --stackSize;\n" -" \n" -" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" -" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -" \n" -" //Optimization - if the BVH is structured as a binary radix tree, then\n" -" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" -" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" -" {\n" -" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" -" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" -" }\n" -" \n" -" //bvhRigidIndex is not used if internal node\n" -" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -" \n" -" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" -" {\n" -" if(isLeaf)\n" -" {\n" -" int4 pair;\n" -" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" -" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -" pair.z = NEW_PAIR_MARKER;\n" -" pair.w = NEW_PAIR_MARKER;\n" -" \n" -" int pairIndex = atomic_inc(out_numPairs);\n" -" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -" }\n" -" \n" -" if(!isLeaf) //Internal node\n" -" {\n" -" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -" {\n" -" //Error\n" -" }\n" -" else\n" -" {\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -" }\n" -" }\n" -" }\n" -" \n" -" }\n" -"}\n" -"//From rayCastKernels.cl\n" -"typedef struct\n" -"{\n" -" float4 m_from;\n" -" float4 m_to;\n" -"} b3RayInfo;\n" -"//From rayCastKernels.cl\n" -"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" -"{\n" -" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" -" return normalize(normal); //OpenCL normalize == vector4 normalize\n" -"}\n" -"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" -"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" -"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" -"{\n" -" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" -" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" -" //\n" -" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" -" //and min.x will be the far plane; otherwise, it is reversed.\n" -" //\n" -" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" -" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" -" \n" -" int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" -" \n" -" //When using vector types, the select() function checks the most signficant bit, \n" -" //but isless() sets the least significant bit.\n" -" isNegative <<= 31;\n" -" //select(b, a, condition) == condition ? a : b\n" -" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" -" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -" \n" -" b3Scalar t_min_final = 0.0f;\n" -" b3Scalar t_max_final = rayLength;\n" -" \n" -" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" -" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" -" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" -" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" -" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" -" \n" -" return (t_min_final <= t_max_final);\n" -"}\n" -"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" -" __global int* rootNodeIndex, \n" -" __global int2* internalNodeChildIndices, \n" -" __global b3AabbCL* internalNodeAabbs,\n" -" __global int2* internalNodeLeafIndexRanges,\n" -" __global SortDataCL* mortonCodesAndAabbIndices,\n" -" \n" -" __global b3RayInfo* rays,\n" -" \n" -" __global int* out_numRayRigidPairs, \n" -" __global int2* out_rayRigidPairs,\n" -" int maxRayRigidPairs, int numRays)\n" -"{\n" -" int rayIndex = get_global_id(0);\n" -" if(rayIndex >= numRays) return;\n" -" \n" -" //\n" -" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -" b3Vector3 rayTo = rays[rayIndex].m_to;\n" -" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -" \n" -" //\n" -" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -" \n" -" int stackSize = 1;\n" -" stack[0] = *rootNodeIndex;\n" -" \n" -" while(stackSize)\n" -" {\n" -" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -" --stackSize;\n" -" \n" -" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" -" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -" \n" -" //bvhRigidIndex is not used if internal node\n" -" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -" \n" -" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" -" {\n" -" if(isLeaf)\n" -" {\n" -" int2 rayRigidPair;\n" -" rayRigidPair.x = rayIndex;\n" -" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -" \n" -" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -" }\n" -" \n" -" if(!isLeaf) //Internal node\n" -" {\n" -" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -" {\n" -" //Error\n" -" }\n" -" else\n" -" {\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -" }\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" -" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" -" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" -"{\n" -" int smallAabbIndex = get_global_id(0);\n" -" if(smallAabbIndex >= numSmallAabbRigids) return;\n" -" \n" -" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" -" for(int i = 0; i < numLargeAabbRigids; ++i)\n" -" {\n" -" b3AabbCL largeAabb = largeAabbs[i];\n" -" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" -" {\n" -" int4 pair;\n" -" pair.x = largeAabb.m_minIndices[3];\n" -" pair.y = smallAabb.m_minIndices[3];\n" -" pair.z = NEW_PAIR_MARKER;\n" -" pair.w = NEW_PAIR_MARKER;\n" -" \n" -" int pairIndex = atomic_inc(out_numPairs);\n" -" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -" }\n" -" }\n" -"}\n" -"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" -" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" -" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" -"{\n" -" int rayIndex = get_global_id(0);\n" -" if(rayIndex >= numRays) return;\n" -" \n" -" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -" b3Vector3 rayTo = rays[rayIndex].m_to;\n" -" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -" \n" -" for(int i = 0; i < numLargeAabbRigids; ++i)\n" -" {\n" -" b3AabbCL rigidAabb = largeRigidAabbs[i];\n" -" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" -" {\n" -" int2 rayRigidPair;\n" -" rayRigidPair.x = rayIndex;\n" -" rayRigidPair.y = rigidAabb.m_minIndices[3];\n" -" \n" -" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -" }\n" -" }\n" -"}\n" -"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" -"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" -"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" -"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" -"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" -"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" -"#define b3Int64 long\n" -"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" -"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" -"{\n" -" //This function only needs to return (i & j) in order for the algorithm to work,\n" -" //but it may help with debugging to mask out the lower bits.\n" -" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" -" b3Int64 sharedBits = i & j;\n" -" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" -" \n" -" return sharedBits & bitmask;\n" -"}\n" -"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" -"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" -"{\n" -" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" -"}\n" -"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global b3Int64* out_commonPrefixes,\n" -" __global int* out_commonPrefixLengths,\n" -" int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if (internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" -" //and the number of internal nodes is always numLeafNodes - 1\n" -" int leftLeafIndex = internalNodeIndex;\n" -" int rightLeafIndex = internalNodeIndex + 1;\n" -" \n" -" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" -" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" -" \n" -" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" -" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" -" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" -" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" -" //\n" -" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" -" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" -" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" -" \n" -" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -"}\n" -"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" -" __global int2* out_childNodes, int numLeafNodes)\n" -"{\n" -" int leafNodeIndex = get_global_id(0);\n" -" if (leafNodeIndex >= numLeafNodes) return;\n" -" \n" -" int numInternalNodes = numLeafNodes - 1;\n" -" \n" -" int leftSplitIndex = leafNodeIndex - 1;\n" -" int rightSplitIndex = leafNodeIndex;\n" -" \n" -" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" \n" -" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" -" //Leaf nodes are considered as having the highest common prefix\n" -" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" -" \n" -" //Handle cases for the edge nodes; the first and last node\n" -" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" -" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" -" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" -" \n" -" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" -" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" -" \n" -" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" -" \n" -" //out_childNodesAsInt[0] == int2.x == left child\n" -" //out_childNodesAsInt[1] == int2.y == right child\n" -" int isLeaf = 1;\n" -" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" -"}\n" -"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" -" __global int2* out_childNodes,\n" -" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" -" int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" -" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" -" \n" -"//#define USE_LINEAR_SEARCH\n" -"#ifdef USE_LINEAR_SEARCH\n" -" int leftIndex = -1;\n" -" int rightIndex = -1;\n" -" \n" -" //Find nearest element to left with a lower common prefix\n" -" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" -" {\n" -" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" -" {\n" -" leftIndex = i;\n" -" break;\n" -" }\n" -" }\n" -" \n" -" //Find nearest element to right with a lower common prefix\n" -" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" -" {\n" -" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" -" {\n" -" rightIndex = i;\n" -" break;\n" -" }\n" -" }\n" -" \n" -"#else //Use binary search\n" -" //Find nearest element to left with a lower common prefix\n" -" int leftIndex = -1;\n" -" {\n" -" int lower = 0;\n" -" int upper = internalNodeIndex - 1;\n" -" \n" -" while(lower <= upper)\n" -" {\n" -" int mid = (lower + upper) / 2;\n" -" b3Int64 midPrefix = commonPrefixes[mid];\n" -" int midPrefixLength = commonPrefixLengths[mid];\n" -" \n" -" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" int right = mid + 1;\n" -" if(right < internalNodeIndex)\n" -" {\n" -" b3Int64 rightPrefix = commonPrefixes[right];\n" -" int rightPrefixLength = commonPrefixLengths[right];\n" -" \n" -" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" -" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" lower = right;\n" -" leftIndex = right;\n" -" }\n" -" else \n" -" {\n" -" leftIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else \n" -" {\n" -" leftIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else upper = mid - 1;\n" -" }\n" -" }\n" -" \n" -" //Find nearest element to right with a lower common prefix\n" -" int rightIndex = -1;\n" -" {\n" -" int lower = internalNodeIndex + 1;\n" -" int upper = numInternalNodes - 1;\n" -" \n" -" while(lower <= upper)\n" -" {\n" -" int mid = (lower + upper) / 2;\n" -" b3Int64 midPrefix = commonPrefixes[mid];\n" -" int midPrefixLength = commonPrefixLengths[mid];\n" -" \n" -" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" int left = mid - 1;\n" -" if(left > internalNodeIndex)\n" -" {\n" -" b3Int64 leftPrefix = commonPrefixes[left];\n" -" int leftPrefixLength = commonPrefixLengths[left];\n" -" \n" -" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" -" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" upper = left;\n" -" rightIndex = left;\n" -" }\n" -" else \n" -" {\n" -" rightIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else \n" -" {\n" -" rightIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else lower = mid + 1;\n" -" }\n" -" }\n" -"#endif\n" -" \n" -" //Select parent\n" -" {\n" -" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" \n" -" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" -" \n" -" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" -" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" -" \n" -" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" -" \n" -" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" -" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" -" \n" -" int isLeaf = 0;\n" -" if(!isRootNode)\n" -" {\n" -" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" -" \n" -" //out_childNodesAsInt[0] == int2.x == left child\n" -" //out_childNodesAsInt[1] == int2.y == right child\n" -" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" }\n" -" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" }\n" -"}\n" -"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" -" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" -"{\n" -" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" //\n" -" int distanceFromRoot = 0;\n" -" {\n" -" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" -" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" -" {\n" -" parentIndex = internalNodeParentNodes[parentIndex];\n" -" ++distanceFromRoot;\n" -" }\n" -" }\n" -" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" -" \n" -" //\n" -" __local int localMaxDistanceFromRoot;\n" -" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" -"}\n" -"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int2* childNodes,\n" -" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" -" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int distance = distanceFromRoot[internalNodeIndex];\n" -" \n" -" if(distance == processedDistance)\n" -" {\n" -" int leftChildIndex = childNodes[internalNodeIndex].x;\n" -" int rightChildIndex = childNodes[internalNodeIndex].y;\n" -" \n" -" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" -" int isRightChildLeaf = isLeafNode(rightChildIndex);\n" -" \n" -" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" -" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" -" \n" -" //leftRigidIndex/rightRigidIndex is not used if internal node\n" -" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" -" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" -" \n" -" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" -" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" -" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" -" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" -" }\n" -"}\n" -"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int numLeafNodes = numInternalNodes + 1;\n" -" \n" -" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" -" \n" -" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" -" \n" -" //Find lowest leaf index covered by this internal node\n" -" {\n" -" int lowestIndex = childNodes.x; //childNodes.x == Left child\n" -" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" -" leafIndexRange.x = lowestIndex;\n" -" }\n" -" \n" -" //Find highest leaf index covered by this internal node\n" -" {\n" -" int highestIndex = childNodes.y; //childNodes.y == Right child\n" -" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" -" leafIndexRange.y = highestIndex;\n" -" }\n" -" \n" -" //\n" -" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" -"}\n" -; +static const char* parallelLinearBvhCL = + "/*\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose,\n" + "including commercial applications, and to alter it and redistribute it freely,\n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Initial Author Jackson Lee, 2014\n" + "typedef float b3Scalar;\n" + "typedef float4 b3Vector3;\n" + "#define b3Max max\n" + "#define b3Min min\n" + "#define b3Sqrt sqrt\n" + "typedef struct\n" + "{\n" + " unsigned int m_key;\n" + " unsigned int m_value;\n" + "} SortDataCL;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} b3AabbCL;\n" + "unsigned int interleaveBits(unsigned int x)\n" + "{\n" + " //........ ........ ......12 3456789A //x\n" + " //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" + " \n" + " //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" + " //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" + " //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" + " \n" + " //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" + " //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" + " //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" + " \n" + " //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" + " //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" + " //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" + " \n" + " //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" + " //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" + " //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" + " \n" + " //........ ........ ......11 11111111 //0x000003FF\n" + " x &= 0x000003FF; //Clear all bits above bit 10\n" + " \n" + " x = (x ^ (x << 16)) & 0xFF0000FF;\n" + " x = (x ^ (x << 8)) & 0x0300F00F;\n" + " x = (x ^ (x << 4)) & 0x030C30C3;\n" + " x = (x ^ (x << 2)) & 0x09249249;\n" + " \n" + " return x;\n" + "}\n" + "unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" + "{\n" + " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" + "}\n" + "__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" + "{\n" + " int separatedAabbIndex = get_global_id(0);\n" + " if(separatedAabbIndex >= numAabbsToSeparate) return;\n" + " int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" + " out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" + "}\n" + "//Should replace with an optimized parallel reduction\n" + "__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" + "{\n" + " //Each time this kernel is added to the command queue, \n" + " //the number of AABBs needing to be merged is halved\n" + " //\n" + " //Example with 159 AABBs:\n" + " // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" + " // numMergedAabbs == 159 - 80 == 79\n" + " //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" + " \n" + " int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" + " int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" + " \n" + " int aabbIndex = get_global_id(0);\n" + " if(aabbIndex >= numMergedAabbs) return;\n" + " \n" + " int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" + " \n" + " b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" + " b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" + " \n" + " b3AabbCL mergedAabb;\n" + " mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" + " mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" + " out_mergedAabb[aabbIndex] = mergedAabb;\n" + "}\n" + "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" + " __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" + "{\n" + " int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" + " if(leafNodeIndex >= numAabbs) return;\n" + " \n" + " b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" + " b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" + " b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" + " \n" + " b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" + " b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" + " b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" + " \n" + " //Quantize into integer coordinates\n" + " //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" + " b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" + " \n" + " int4 discretePosition;\n" + " discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" + " discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" + " discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" + " \n" + " //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" + " discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" + " discretePosition += 512;\n" + " \n" + " //Interleave bits(assign a morton code, also known as a z-curve)\n" + " unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" + " \n" + " //\n" + " SortDataCL mortonCodeIndexPair;\n" + " mortonCodeIndexPair.m_key = mortonCode;\n" + " mortonCodeIndexPair.m_value = leafNodeIndex;\n" + " \n" + " out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" + "}\n" + "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" + "//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" + "//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" + "//In both cases, the bit should be cleared to access the actual node index.\n" + "int isLeafNode(int index) { return (index >> 31 == 0); }\n" + "int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" + "int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" + "//From sap.cl\n" + "#define NEW_PAIR_MARKER -1\n" + "bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "//From sap.cl\n" + "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" + " __global int* rootNodeIndex, \n" + " __global int2* internalNodeChildIndices, \n" + " __global b3AabbCL* internalNodeAabbs,\n" + " __global int2* internalNodeLeafIndexRanges,\n" + " \n" + " __global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" + " int maxPairs, int numQueryAabbs)\n" + "{\n" + " //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" + " //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" + " int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" + " if(queryBvhNodeIndex >= numQueryAabbs) return;\n" + " \n" + " int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" + " b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" + " \n" + " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" + " \n" + " int stackSize = 1;\n" + " stack[0] = *rootNodeIndex;\n" + " \n" + " while(stackSize)\n" + " {\n" + " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" + " --stackSize;\n" + " \n" + " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" + " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" + " \n" + " //Optimization - if the BVH is structured as a binary radix tree, then\n" + " //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" + " //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" + " {\n" + " int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" + " if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" + " }\n" + " \n" + " //bvhRigidIndex is not used if internal node\n" + " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" + " \n" + " b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" + " if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" + " {\n" + " if(isLeaf)\n" + " {\n" + " int4 pair;\n" + " pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" + " pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" + " pair.z = NEW_PAIR_MARKER;\n" + " pair.w = NEW_PAIR_MARKER;\n" + " \n" + " int pairIndex = atomic_inc(out_numPairs);\n" + " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" + " }\n" + " \n" + " if(!isLeaf) //Internal node\n" + " {\n" + " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" + " {\n" + " //Error\n" + " }\n" + " else\n" + " {\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" + " }\n" + " }\n" + " }\n" + " \n" + " }\n" + "}\n" + "//From rayCastKernels.cl\n" + "typedef struct\n" + "{\n" + " float4 m_from;\n" + " float4 m_to;\n" + "} b3RayInfo;\n" + "//From rayCastKernels.cl\n" + "b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" + "{\n" + " b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" + " return normalize(normal); //OpenCL normalize == vector4 normalize\n" + "}\n" + "b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" + "b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" + "int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" + "{\n" + " //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" + " //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" + " //\n" + " //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" + " //and min.x will be the far plane; otherwise, it is reversed.\n" + " //\n" + " //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" + " //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" + " \n" + " int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" + " \n" + " //When using vector types, the select() function checks the most signficant bit, \n" + " //but isless() sets the least significant bit.\n" + " isNegative <<= 31;\n" + " //select(b, a, condition) == condition ? a : b\n" + " //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" + " b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" + " b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" + " \n" + " b3Scalar t_min_final = 0.0f;\n" + " b3Scalar t_max_final = rayLength;\n" + " \n" + " //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" + " //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" + " //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" + " t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" + " t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" + " \n" + " return (t_min_final <= t_max_final);\n" + "}\n" + "__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" + " __global int* rootNodeIndex, \n" + " __global int2* internalNodeChildIndices, \n" + " __global b3AabbCL* internalNodeAabbs,\n" + " __global int2* internalNodeLeafIndexRanges,\n" + " __global SortDataCL* mortonCodesAndAabbIndices,\n" + " \n" + " __global b3RayInfo* rays,\n" + " \n" + " __global int* out_numRayRigidPairs, \n" + " __global int2* out_rayRigidPairs,\n" + " int maxRayRigidPairs, int numRays)\n" + "{\n" + " int rayIndex = get_global_id(0);\n" + " if(rayIndex >= numRays) return;\n" + " \n" + " //\n" + " b3Vector3 rayFrom = rays[rayIndex].m_from;\n" + " b3Vector3 rayTo = rays[rayIndex].m_to;\n" + " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" + " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" + " \n" + " //\n" + " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" + " \n" + " int stackSize = 1;\n" + " stack[0] = *rootNodeIndex;\n" + " \n" + " while(stackSize)\n" + " {\n" + " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" + " --stackSize;\n" + " \n" + " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" + " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" + " \n" + " //bvhRigidIndex is not used if internal node\n" + " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" + " \n" + " b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" + " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" + " {\n" + " if(isLeaf)\n" + " {\n" + " int2 rayRigidPair;\n" + " rayRigidPair.x = rayIndex;\n" + " rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" + " \n" + " int pairIndex = atomic_inc(out_numRayRigidPairs);\n" + " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" + " }\n" + " \n" + " if(!isLeaf) //Internal node\n" + " {\n" + " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" + " {\n" + " //Error\n" + " }\n" + " else\n" + " {\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" + " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" + " int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" + "{\n" + " int smallAabbIndex = get_global_id(0);\n" + " if(smallAabbIndex >= numSmallAabbRigids) return;\n" + " \n" + " b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" + " for(int i = 0; i < numLargeAabbRigids; ++i)\n" + " {\n" + " b3AabbCL largeAabb = largeAabbs[i];\n" + " if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" + " {\n" + " int4 pair;\n" + " pair.x = largeAabb.m_minIndices[3];\n" + " pair.y = smallAabb.m_minIndices[3];\n" + " pair.z = NEW_PAIR_MARKER;\n" + " pair.w = NEW_PAIR_MARKER;\n" + " \n" + " int pairIndex = atomic_inc(out_numPairs);\n" + " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" + " }\n" + " }\n" + "}\n" + "__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" + " __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" + " int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" + "{\n" + " int rayIndex = get_global_id(0);\n" + " if(rayIndex >= numRays) return;\n" + " \n" + " b3Vector3 rayFrom = rays[rayIndex].m_from;\n" + " b3Vector3 rayTo = rays[rayIndex].m_to;\n" + " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" + " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" + " \n" + " for(int i = 0; i < numLargeAabbRigids; ++i)\n" + " {\n" + " b3AabbCL rigidAabb = largeRigidAabbs[i];\n" + " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" + " {\n" + " int2 rayRigidPair;\n" + " rayRigidPair.x = rayIndex;\n" + " rayRigidPair.y = rigidAabb.m_minIndices[3];\n" + " \n" + " int pairIndex = atomic_inc(out_numRayRigidPairs);\n" + " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" + " }\n" + " }\n" + "}\n" + "//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" + "//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" + "//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" + "//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" + "#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" + "#define B3_PLBVH_ROOT_NODE_MARKER -1\n" + "#define b3Int64 long\n" + "int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" + "b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" + "{\n" + " //This function only needs to return (i & j) in order for the algorithm to work,\n" + " //but it may help with debugging to mask out the lower bits.\n" + " b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" + " b3Int64 sharedBits = i & j;\n" + " b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" + " \n" + " return sharedBits & bitmask;\n" + "}\n" + "//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" + "int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" + "{\n" + " return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" + "}\n" + "__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global b3Int64* out_commonPrefixes,\n" + " __global int* out_commonPrefixLengths,\n" + " int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if (internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" + " //and the number of internal nodes is always numLeafNodes - 1\n" + " int leftLeafIndex = internalNodeIndex;\n" + " int rightLeafIndex = internalNodeIndex + 1;\n" + " \n" + " int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" + " int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" + " \n" + " //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" + " //Append the index of each leaf node to each morton code so that there are no duplicates.\n" + " //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" + " //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" + " //\n" + " //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" + " b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" + " b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" + " \n" + " out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" + " out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" + "}\n" + "__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" + " __global int2* out_childNodes, int numLeafNodes)\n" + "{\n" + " int leafNodeIndex = get_global_id(0);\n" + " if (leafNodeIndex >= numLeafNodes) return;\n" + " \n" + " int numInternalNodes = numLeafNodes - 1;\n" + " \n" + " int leftSplitIndex = leafNodeIndex - 1;\n" + " int rightSplitIndex = leafNodeIndex;\n" + " \n" + " int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " \n" + " //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" + " //Leaf nodes are considered as having the highest common prefix\n" + " int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" + " \n" + " //Handle cases for the edge nodes; the first and last node\n" + " //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" + " if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" + " if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" + " \n" + " int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" + " out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" + " \n" + " int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" + " \n" + " //out_childNodesAsInt[0] == int2.x == left child\n" + " //out_childNodesAsInt[1] == int2.y == right child\n" + " int isLeaf = 1;\n" + " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" + " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" + "}\n" + "__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" + " __global int2* out_childNodes,\n" + " __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" + " int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" + " int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" + " \n" + "//#define USE_LINEAR_SEARCH\n" + "#ifdef USE_LINEAR_SEARCH\n" + " int leftIndex = -1;\n" + " int rightIndex = -1;\n" + " \n" + " //Find nearest element to left with a lower common prefix\n" + " for(int i = internalNodeIndex - 1; i >= 0; --i)\n" + " {\n" + " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" + " if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" + " {\n" + " leftIndex = i;\n" + " break;\n" + " }\n" + " }\n" + " \n" + " //Find nearest element to right with a lower common prefix\n" + " for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" + " {\n" + " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" + " if(nodeRightSharedPrefixLength < nodePrefixLength)\n" + " {\n" + " rightIndex = i;\n" + " break;\n" + " }\n" + " }\n" + " \n" + "#else //Use binary search\n" + " //Find nearest element to left with a lower common prefix\n" + " int leftIndex = -1;\n" + " {\n" + " int lower = 0;\n" + " int upper = internalNodeIndex - 1;\n" + " \n" + " while(lower <= upper)\n" + " {\n" + " int mid = (lower + upper) / 2;\n" + " b3Int64 midPrefix = commonPrefixes[mid];\n" + " int midPrefixLength = commonPrefixLengths[mid];\n" + " \n" + " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" + " if(nodeMidSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " int right = mid + 1;\n" + " if(right < internalNodeIndex)\n" + " {\n" + " b3Int64 rightPrefix = commonPrefixes[right];\n" + " int rightPrefixLength = commonPrefixLengths[right];\n" + " \n" + " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" + " if(nodeRightSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " lower = right;\n" + " leftIndex = right;\n" + " }\n" + " else \n" + " {\n" + " leftIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else \n" + " {\n" + " leftIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else upper = mid - 1;\n" + " }\n" + " }\n" + " \n" + " //Find nearest element to right with a lower common prefix\n" + " int rightIndex = -1;\n" + " {\n" + " int lower = internalNodeIndex + 1;\n" + " int upper = numInternalNodes - 1;\n" + " \n" + " while(lower <= upper)\n" + " {\n" + " int mid = (lower + upper) / 2;\n" + " b3Int64 midPrefix = commonPrefixes[mid];\n" + " int midPrefixLength = commonPrefixLengths[mid];\n" + " \n" + " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" + " if(nodeMidSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " int left = mid - 1;\n" + " if(left > internalNodeIndex)\n" + " {\n" + " b3Int64 leftPrefix = commonPrefixes[left];\n" + " int leftPrefixLength = commonPrefixLengths[left];\n" + " \n" + " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" + " if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " upper = left;\n" + " rightIndex = left;\n" + " }\n" + " else \n" + " {\n" + " rightIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else \n" + " {\n" + " rightIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else lower = mid + 1;\n" + " }\n" + " }\n" + "#endif\n" + " \n" + " //Select parent\n" + " {\n" + " int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " \n" + " int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" + " \n" + " if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" + " else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" + " \n" + " int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" + " \n" + " int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" + " out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" + " \n" + " int isLeaf = 0;\n" + " if(!isRootNode)\n" + " {\n" + " int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" + " \n" + " //out_childNodesAsInt[0] == int2.x == left child\n" + " //out_childNodesAsInt[1] == int2.y == right child\n" + " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" + " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" + " }\n" + " else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" + " }\n" + "}\n" + "__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" + " __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" + "{\n" + " if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " //\n" + " int distanceFromRoot = 0;\n" + " {\n" + " int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" + " while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" + " {\n" + " parentIndex = internalNodeParentNodes[parentIndex];\n" + " ++distanceFromRoot;\n" + " }\n" + " }\n" + " out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" + " \n" + " //\n" + " __local int localMaxDistanceFromRoot;\n" + " if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" + "}\n" + "__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global int2* childNodes,\n" + " __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" + " int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " int distance = distanceFromRoot[internalNodeIndex];\n" + " \n" + " if(distance == processedDistance)\n" + " {\n" + " int leftChildIndex = childNodes[internalNodeIndex].x;\n" + " int rightChildIndex = childNodes[internalNodeIndex].y;\n" + " \n" + " int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" + " int isRightChildLeaf = isLeafNode(rightChildIndex);\n" + " \n" + " leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" + " rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" + " \n" + " //leftRigidIndex/rightRigidIndex is not used if internal node\n" + " int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" + " int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" + " \n" + " b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" + " b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" + " \n" + " b3AabbCL mergedAabb;\n" + " mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" + " mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" + " internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" + " }\n" + "}\n" + "__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " int numLeafNodes = numInternalNodes + 1;\n" + " \n" + " int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" + " \n" + " int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" + " \n" + " //Find lowest leaf index covered by this internal node\n" + " {\n" + " int lowestIndex = childNodes.x; //childNodes.x == Left child\n" + " while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" + " leafIndexRange.x = lowestIndex;\n" + " }\n" + " \n" + " //Find highest leaf index covered by this internal node\n" + " {\n" + " int highestIndex = childNodes.y; //childNodes.y == Right child\n" + " while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" + " leafIndexRange.y = highestIndex;\n" + " }\n" + " \n" + " //\n" + " out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h index 04d40fcf26..d6999b94cb 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -1,342 +1,341 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* sapCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#define NEW_PAIR_MARKER -1\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"/// conservative test for overlap between two aabbs\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numUnsortedAabbs)\n" -" return;\n" -" int j = get_global_id(1);\n" -" if (j>=numUnSortedAabbs2)\n" -" return;\n" -" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" -" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" -" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" -" {\n" -" int4 myPair;\n" -" \n" -" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" -" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" -" if (xIndex>yIndex)\n" -" {\n" -" int tmp = xIndex;\n" -" xIndex=yIndex;\n" -" yIndex=tmp;\n" -" }\n" -" \n" -" myPair.x = xIndex;\n" -" myPair.y = yIndex;\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" for (int j=i+1;j<numObjects;j++)\n" -" {\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" for (int j=i+1;j<numObjects;j++)\n" -" {\n" -" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -" {\n" -" break;\n" -" }\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int localId = get_local_id(0);\n" -" __local int numActiveWgItems[1];\n" -" __local int breakRequest[1];\n" -" if (localId==0)\n" -" {\n" -" numActiveWgItems[0] = 0;\n" -" breakRequest[0] = 0;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" atomic_inc(numActiveWgItems);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" int localBreak = 0;\n" -" int j=i+1;\n" -" do\n" -" {\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j<numObjects)\n" -" {\n" -" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -" {\n" -" if (!localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j>=numObjects && !localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (!localBreak)\n" -" {\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -" j++;\n" -" } while (breakRequest[0]<numActiveWgItems[0]);\n" -"}\n" -"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int localId = get_local_id(0);\n" -" __local int numActiveWgItems[1];\n" -" __local int breakRequest[1];\n" -" __local btAabbCL localAabbs[128];// = aabbs[i];\n" -" \n" -" btAabbCL myAabb;\n" -" \n" -" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" -" float testValue = myAabb.m_maxElems[axis];\n" -" \n" -" if (localId==0)\n" -" {\n" -" numActiveWgItems[0] = 0;\n" -" breakRequest[0] = 0;\n" -" }\n" -" int localCount=0;\n" -" int block=0;\n" -" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" -" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" atomic_inc(numActiveWgItems);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" int localBreak = 0;\n" -" \n" -" int j=i+1;\n" -" do\n" -" {\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j<numObjects)\n" -" {\n" -" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" -" {\n" -" if (!localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j>=numObjects && !localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (!localBreak)\n" -" {\n" -" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = myAabb.m_minIndices[3];\n" -" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" localCount++;\n" -" if (localCount==64)\n" -" {\n" -" localCount = 0;\n" -" block+=64; \n" -" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" -" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" -" }\n" -" j++;\n" -" \n" -" } while (breakRequest[0]<numActiveWgItems[0]);\n" -" \n" -"}\n" -"//http://stereopsis.com/radix.html\n" -"unsigned int FloatFlip(float fl);\n" -"unsigned int FloatFlip(float fl)\n" -"{\n" -" unsigned int f = *(unsigned int*)&fl;\n" -" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" -" return f ^ mask;\n" -"}\n" -"float IFloatFlip(unsigned int f);\n" -"float IFloatFlip(unsigned int f)\n" -"{\n" -" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" -" unsigned int fl = f ^ mask;\n" -" return *(float*)&fl;\n" -"}\n" -"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" int src = destAabbs[i].m_maxIndices[3];\n" -" destAabbs[i] = allAabbs[src];\n" -" destAabbs[i].m_maxIndices[3] = src;\n" -"}\n" -"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" \n" -" \n" -" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" -" sortData[i].y = i;\n" -" \n" -"}\n" -"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" \n" -" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" -"}\n" -"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numAabbs)\n" -" return;\n" -" \n" -" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" -" \n" -" float4 s;\n" -" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" -" sum[i]=s;\n" -" sum2[i]=s*s; \n" -"}\n" -; +static const char* sapCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#define NEW_PAIR_MARKER -1\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} btAabbCL;\n" + "/// conservative test for overlap between two aabbs\n" + "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numUnsortedAabbs)\n" + " return;\n" + " int j = get_global_id(1);\n" + " if (j>=numUnSortedAabbs2)\n" + " return;\n" + " __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" + " __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" + " if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" + " {\n" + " int4 myPair;\n" + " \n" + " int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" + " int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" + " if (xIndex>yIndex)\n" + " {\n" + " int tmp = xIndex;\n" + " xIndex=yIndex;\n" + " yIndex=tmp;\n" + " }\n" + " \n" + " myPair.x = xIndex;\n" + " myPair.y = yIndex;\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " for (int j=i+1;j<numObjects;j++)\n" + " {\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " for (int j=i+1;j<numObjects;j++)\n" + " {\n" + " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" + " {\n" + " break;\n" + " }\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int localId = get_local_id(0);\n" + " __local int numActiveWgItems[1];\n" + " __local int breakRequest[1];\n" + " if (localId==0)\n" + " {\n" + " numActiveWgItems[0] = 0;\n" + " breakRequest[0] = 0;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " atomic_inc(numActiveWgItems);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " int localBreak = 0;\n" + " int j=i+1;\n" + " do\n" + " {\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j<numObjects)\n" + " {\n" + " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" + " {\n" + " if (!localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j>=numObjects && !localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (!localBreak)\n" + " {\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + " j++;\n" + " } while (breakRequest[0]<numActiveWgItems[0]);\n" + "}\n" + "__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int localId = get_local_id(0);\n" + " __local int numActiveWgItems[1];\n" + " __local int breakRequest[1];\n" + " __local btAabbCL localAabbs[128];// = aabbs[i];\n" + " \n" + " btAabbCL myAabb;\n" + " \n" + " myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" + " float testValue = myAabb.m_maxElems[axis];\n" + " \n" + " if (localId==0)\n" + " {\n" + " numActiveWgItems[0] = 0;\n" + " breakRequest[0] = 0;\n" + " }\n" + " int localCount=0;\n" + " int block=0;\n" + " localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" + " localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " atomic_inc(numActiveWgItems);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " int localBreak = 0;\n" + " \n" + " int j=i+1;\n" + " do\n" + " {\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j<numObjects)\n" + " {\n" + " if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" + " {\n" + " if (!localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j>=numObjects && !localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (!localBreak)\n" + " {\n" + " if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = myAabb.m_minIndices[3];\n" + " myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " localCount++;\n" + " if (localCount==64)\n" + " {\n" + " localCount = 0;\n" + " block+=64; \n" + " localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" + " localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" + " }\n" + " j++;\n" + " \n" + " } while (breakRequest[0]<numActiveWgItems[0]);\n" + " \n" + "}\n" + "//http://stereopsis.com/radix.html\n" + "unsigned int FloatFlip(float fl);\n" + "unsigned int FloatFlip(float fl)\n" + "{\n" + " unsigned int f = *(unsigned int*)&fl;\n" + " unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" + " return f ^ mask;\n" + "}\n" + "float IFloatFlip(unsigned int f);\n" + "float IFloatFlip(unsigned int f)\n" + "{\n" + " unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" + " unsigned int fl = f ^ mask;\n" + " return *(float*)&fl;\n" + "}\n" + "__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " int src = destAabbs[i].m_maxIndices[3];\n" + " destAabbs[i] = allAabbs[src];\n" + " destAabbs[i].m_maxIndices[3] = src;\n" + "}\n" + "__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " \n" + " \n" + " sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" + " sortData[i].y = i;\n" + " \n" + "}\n" + "__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " \n" + " sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" + "}\n" + "__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numAabbs)\n" + " return;\n" + " \n" + " btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" + " \n" + " float4 s;\n" + " s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" + " sum[i]=s;\n" + " sum2[i]=s*s; \n" + "}\n"; |