diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision')
16 files changed, 5538 insertions, 0 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h new file mode 100644 index 0000000000..0ed8aa8232 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -0,0 +1,44 @@ + +#ifndef B3_GPU_BROADPHASE_INTERFACE_H +#define B3_GPU_BROADPHASE_INTERFACE_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Common/b3Vector3.h" +#include "b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" + +class b3GpuBroadphaseInterface +{ +public: + + typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q); + + virtual ~b3GpuBroadphaseInterface() + { + } + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + + virtual void calculateOverlappingPairs(int maxPairs)=0; + virtual void calculateOverlappingPairsHost(int maxPairs)=0; + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu()=0; + + virtual cl_mem getAabbBufferWS()=0; + virtual int getNumOverlap()=0; + virtual cl_mem getOverlappingPairBuffer()=0; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0; + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0; + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0; + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0; + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0; + +}; + +#endif //B3_GPU_BROADPHASE_INTERFACE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp new file mode 100644 index 0000000000..74d0c8056c --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -0,0 +1,385 @@ + +#include "b3GpuGridBroadphase.h" +#include "Bullet3Geometry/b3AabbUtil.h" +#include "kernels/gridBroadphaseKernels.h" +#include "kernels/sapKernels.h" +//#include "kernels/gridBroadphase.cl" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + + + +#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" +#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" + +cl_kernel kCalcHashAABB; +cl_kernel kClearCellStart; +cl_kernel kFindCellStart; +cl_kernel kFindOverlappingPairs; +cl_kernel m_copyAabbsKernel; +cl_kernel m_sap2Kernel; + + + + + +//int maxPairsPerBody = 64; +int maxBodiesPerCell = 256;//?? + +b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) +:m_context(ctx), +m_device(device), +m_queue(q), +m_allAabbsGPU1(ctx,q), +m_smallAabbsMappingGPU(ctx,q), +m_largeAabbsMappingGPU(ctx,q), +m_gpuPairs(ctx,q), + +m_hashGpu(ctx,q), + +m_cellStartGpu(ctx,q), +m_paramsGPU(ctx,q) +{ + + + b3Vector3 gridSize = b3MakeVector3(3,3,3); + b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); + + m_paramsCPU.m_gridSize[0] = 128; + m_paramsCPU.m_gridSize[1] = 128; + m_paramsCPU.m_gridSize[2] = 128; + m_paramsCPU.m_gridSize[3] = maxBodiesPerCell; + m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell); + m_paramsCPU.m_invCellSize[0] = invGridSize[0]; + m_paramsCPU.m_invCellSize[1] = invGridSize[1]; + m_paramsCPU.m_invCellSize[2] = invGridSize[2]; + m_paramsCPU.m_invCellSize[3] = 0.f; + m_paramsGPU.push_back(m_paramsCPU); + + cl_int errNum=0; + + { + const char* sapSrc = sapCL; + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); + b3Assert(errNum==CL_SUCCESS); + m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + } + + { + + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH); + b3Assert(errNum==CL_SUCCESS); + + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + + + + } + + m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + +} +b3GpuGridBroadphase::~b3GpuGridBroadphase() +{ + clReleaseKernel( kCalcHashAABB); + clReleaseKernel( kClearCellStart); + clReleaseKernel( kFindCellStart); + clReleaseKernel( kFindOverlappingPairs); + clReleaseKernel( m_sap2Kernel); + clReleaseKernel( m_copyAabbsKernel); + + + + delete m_sorter; +} + + + +void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); + + m_allAabbsCPU1.push_back(aabb); + +} +void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); + + m_allAabbsCPU1.push_back(aabb); +} + +void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) +{ + B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); + + + if (0) + { + calculateOverlappingPairsHost(maxPairs); + /* + b3AlignedObjectArray<b3Int4> cpuPairs; + m_gpuPairs.copyToHost(cpuPairs); + printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); + for (int i=0;i<m_gpuPairs.size();i++) + { + printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y); + } + */ + return; + } + + + + + + int numSmallAabbs = m_smallAabbsMappingGPU.size(); + + b3OpenCLArray<int> pairCount(m_context,m_queue); + pairCount.push_back(0); + m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody); + + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ), + b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_gpuPairs.getBufferCL() ), + b3BufferInfoCL(pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( 0 );//axis is not used + launcher.setConst( maxPairs ); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + int numPairs = pairCount.at(0); + + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + } + } + + + + + if (numSmallAabbs) + { + B3_PROFILE("gridKernel"); + m_hashGpu.resize(numSmallAabbs); + { + B3_PROFILE("kCalcHashAABB"); + b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_allAabbsGPU1.getBufferCL()); + launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(this->m_paramsGPU.getBufferCL()); + launch.launch1D(numSmallAabbs); + } + + m_sorter->execute(m_hashGpu); + + int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + m_cellStartGpu.resize(numCells); + //b3AlignedObjectArray<int > cellStartCpu; + + + { + B3_PROFILE("kClearCellStart"); + b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart"); + launch.setConst(numCells); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numCells); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + + { + B3_PROFILE("kFindCellStart"); + b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numSmallAabbs); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + { + B3_PROFILE("kFindOverlappingPairs"); + + + b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_allAabbsGPU1.getBufferCL()); + launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + + launch.setBuffer(m_paramsGPU.getBufferCL()); + //launch.setBuffer(0); + launch.setBuffer(pairCount.getBufferCL()); + launch.setBuffer(m_gpuPairs.getBufferCL()); + + launch.setConst(maxPairs); + launch.launch1D(numSmallAabbs); + + + int numPairs = pairCount.at(0); + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + + m_gpuPairs.resize(numPairs); + + if (0) + { + b3AlignedObjectArray<b3Int4> pairsCpu; + m_gpuPairs.copyToHost(pairsCpu); + + int sz = m_gpuPairs.size(); + printf("m_gpuPairs.size()=%d\n",sz); + for (int i=0;i<m_gpuPairs.size();i++) + { + printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y); + } + + printf("?!?\n"); + } + + } + + + } + + + + + + //calculateOverlappingPairsHost(maxPairs); +} +void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + + m_hostPairs.resize(0); + m_allAabbsGPU1.copyToHost(m_allAabbsCPU1); + for (int i=0;i<m_allAabbsCPU1.size();i++) + { + for (int j=i+1;j<m_allAabbsCPU1.size();j++) + { + if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec, + m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec)) + { + b3Int4 pair; + int a = m_allAabbsCPU1[j].m_minIndices[3]; + int b = m_allAabbsCPU1[i].m_minIndices[3]; + if (a<=b) + { + pair.x = a; + pair.y = b;//store the original index in the unsorted aabb array + } else + { + pair.x = b; + pair.y = a;//store the original index in the unsorted aabb array + } + + if (m_hostPairs.size()<maxPairs) + { + m_hostPairs.push_back(pair); + } + } + } + } + + + m_gpuPairs.copyFromHost(m_hostPairs); + + +} + + //call writeAabbsToGpu after done making all changes (createProxy etc) +void b3GpuGridBroadphase::writeAabbsToGpu() +{ + m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1); + m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); + m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); + +} + +cl_mem b3GpuGridBroadphase::getAabbBufferWS() +{ + return this->m_allAabbsGPU1.getBufferCL(); +} +int b3GpuGridBroadphase::getNumOverlap() +{ + return m_gpuPairs.size(); +} +cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() +{ + return m_gpuPairs.getBufferCL(); +} + +b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() +{ + return m_allAabbsGPU1; +} + +b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() +{ + return m_allAabbsCPU1; +} + +b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU() +{ + return m_gpuPairs; +} +b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h new file mode 100644 index 0000000000..ec18c9f716 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -0,0 +1,88 @@ +#ifndef B3_GPU_GRID_BROADPHASE_H +#define B3_GPU_GRID_BROADPHASE_H + +#include "b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +struct b3ParamsGridBroadphaseCL +{ + + float m_invCellSize[4]; + int m_gridSize[4]; + + int getMaxBodiesPerCell() const + { + return m_gridSize[3]; + } + + void setMaxBodiesPerCell(int maxOverlap) + { + m_gridSize[3] = maxOverlap; + } +}; + + +class b3GpuGridBroadphase : public b3GpuBroadphaseInterface +{ +protected: + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + b3AlignedObjectArray<b3Int4> m_hostPairs; + b3OpenCLArray<b3Int4> m_gpuPairs; + + b3OpenCLArray<b3SortData> m_hashGpu; + b3OpenCLArray<int> m_cellStartGpu; + + + b3ParamsGridBroadphaseCL m_paramsCPU; + b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; + + class b3RadixSort32CL* m_sorter; + +public: + + b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); + virtual ~b3GpuGridBroadphase(); + + static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuGridBroadphase(ctx,device,q); + } + + + + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); + +}; + +#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp new file mode 100644 index 0000000000..641df9eb12 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -0,0 +1,577 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + +#include "b3GpuParallelLinearBvh.h" + +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : + m_queue(queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + m_maxDistanceFromRoot(context, queue), + m_temp(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_commonPrefixes(context, queue), + m_commonPrefixLengths(context, queue), + m_distanceFromRoot(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue), + + m_largeAabbs(context, queue) +{ + m_rootNodeIndex.resize(1); + m_maxDistanceFromRoot.resize(1); + m_temp.resize(1); + + // + const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + cl_int error; + char* additionalMacros = 0; + m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); + b3Assert(m_parallelLinearBvhProgram); + + m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_separateAabbsKernel); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findAllNodesMergedAabbKernel); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); + + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_computeAdjacentPairCommonPrefixKernel); + m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); + m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findDistanceFromRootKernel); + m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findLeafIndexRangesKernel); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhCalculateOverlappingPairsKernel); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhRayTraverseKernel); + m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbAabbTestKernel); + m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbRayTestKernel); +} + +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +{ + clReleaseKernel(m_separateAabbsKernel); + clReleaseKernel(m_findAllNodesMergedAabbKernel); + clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); + + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); + clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); + clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); + clReleaseKernel(m_findDistanceFromRootKernel); + clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + clReleaseKernel(m_findLeafIndexRangesKernel); + + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); + clReleaseKernel(m_plbvhRayTraverseKernel); + clReleaseKernel(m_plbvhLargeAabbAabbTestKernel); + clReleaseKernel(m_plbvhLargeAabbRayTestKernel); + + clReleaseProgram(m_parallelLinearBvhProgram); +} + +void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices) +{ + B3_PROFILE("b3ParallelLinearBvh::build()"); + + int numLargeAabbs = largeAabbIndices.size(); + int numSmallAabbs = smallAabbIndices.size(); + + //Since all AABBs(both large and small) are input as a contiguous array, + //with 2 additional arrays used to indicate the indices of large and small AABBs, + //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH. + { + B3_PROFILE("Separate large and small AABBs"); + + m_largeAabbs.resize(numLargeAabbs); + m_leafNodeAabbs.resize(numSmallAabbs); + + //Write large AABBs into m_largeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( largeAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_largeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbs); + + launcher.launch1D(numLargeAabbs); + } + + //Write small AABBs into m_leafNodeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( smallAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numSmallAabbs); + + launcher.launch1D(numSmallAabbs); + } + + clFinish(m_queue); + } + + // + int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs + int numInternalNodes = numLeaves - 1; + + if(numLeaves < 2) + { + //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(), + //so it does not matter if numLeaves == 0 and rootNodeIndex == -1 + int rootNodeIndex = numLeaves - 1; + m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); + + //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm, + //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index + //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work. + //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) + if(numLeaves == 1) + { + b3SortData leaf; + leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set + + m_mortonCodesAndAabbIndicies.resize(1); + m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1); + } + + return; + } + + // + { + m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeLeafIndexRanges.resize(numInternalNodes); + m_internalNodeChildNodes.resize(numInternalNodes); + m_internalNodeParentNodes.resize(numInternalNodes); + + m_commonPrefixes.resize(numInternalNodes); + m_commonPrefixLengths.resize(numInternalNodes); + m_distanceFromRoot.resize(numInternalNodes); + + m_leafNodeParentNodes.resize(numLeaves); + m_mortonCodesAndAabbIndicies.resize(numLeaves); + m_mergedAabb.resize(numLeaves); + } + + //Find the merged AABB of all small AABBs; this is used to define the size of + //each cell in the virtual grid for the next kernel(2^10 cells in each dimension). + { + B3_PROFILE("Find AABB of merged nodes"); + + m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array + + for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numAabbsNeedingMerge); + + launcher.launch1D(numAabbsNeedingMerge); + } + + clFinish(m_queue); + } + + //Insert the center of the AABBs into a virtual grid, + //then convert the discrete grid coordinates into a morton code + //For each element in m_mortonCodesAndAabbIndicies, set + // m_key == morton code (value to sort by) + // m_value == small AABB index + { + B3_PROFILE("Assign morton codes"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + // + { + B3_PROFILE("Sort leaves by morton codes"); + + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); + clFinish(m_queue); + } + + // + constructBinaryRadixTree(); + + + //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices. + //The root node contains leaf node indices in the range [0, numLeafNodes - 1]. + //The child nodes of each node split their parent's index range into 2 contiguous halves. + // + //For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31]. + //The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31]. + // + //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice + { + B3_PROFILE("m_findLeafIndexRangesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } +} + +void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs) +{ + int maxPairs = out_overlappingPairs.size(); + b3OpenCLArray<int>& numPairsGpu = m_temp; + + int reset = 0; + numPairsGpu.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 1 ) + { + B3_PROFILE("PLBVH small-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH large-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + + // + int numPairs = -1; + numPairsGpu.copyToHostPointer(&numPairs, 1); + if(numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + numPairsGpu.copyFromHostPointer(&maxPairs, 1); + } + + out_overlappingPairs.resize(numPairs); +} + + +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) +{ + B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); + + int numRays = rays.size(); + int maxRayRigidPairs = out_rayRigidPairs.size(); + + int reset = 0; + out_numRayRigidPairs.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH ray test small AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if(numLargeAabbRigids > 0) + { + B3_PROFILE("PLBVH ray test large AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + // + int numRayRigidPairs = -1; + out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); + + if(numRayRigidPairs > maxRayRigidPairs) + b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); + +} + +void b3GpuParallelLinearBvh::constructBinaryRadixTree() +{ + B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); + + int numLeaves = m_leafNodeAabbs.size(); + int numInternalNodes = numLeaves - 1; + + //Each internal node is placed in between 2 leaf nodes. + //By using this arrangement and computing the common prefix between + //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree. + { + B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //For each leaf node, select its parent node by + //comparing the 2 nearest internal nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + //For each internal node, perform 2 binary searches among the other internal nodes + //to its left and right to find its potential parent nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Find the number of nodes seperating each internal node and the root node + //so that the AABBs can be set using the next kernel. + //Also determine the maximum number of nodes separating an internal node and the root node. + { + B3_PROFILE("m_findDistanceFromRootKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Starting from the internal nodes nearest to the leaf nodes, recursively move up + //the tree towards the root to set the AABBs of each internal node; each internal node + //checks its children and merges their AABBs + { + B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); + + int maxDistanceFromRoot = -1; + { + B3_PROFILE("copy maxDistanceFromRoot to CPU"); + m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); + clFinish(m_queue); + } + + for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxDistanceFromRoot); + launcher.setConst(distanceFromRoot); + launcher.setConst(numInternalNodes); + + //It may seem inefficent to launch a thread for each internal node when a + //much smaller number of nodes is actually processed, but this is actually + //faster than determining the exact nodes that are ready to merge their child AABBs. + launcher.launch1D(numInternalNodes); + } + + clFinish(m_queue); + } +} + +
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h new file mode 100644 index 0000000000..effe617b7b --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -0,0 +1,125 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H +#define B3_GPU_PARALLEL_LINEAR_BVH_H + +//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" + +#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" + +#define b3Int64 cl_long + +///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame +///@remarks +///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details. +///@par +///Related papers: \n +///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n +///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n +///@par +///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Sort morton codes +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs +///@par +///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. +///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. +///Instead of searching for the child nodes of each internal node, we search for the parent node of each node. +///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs. +class b3GpuParallelLinearBvh +{ + cl_command_queue m_queue; + + cl_program m_parallelLinearBvhProgram; + + cl_kernel m_separateAabbsKernel; + cl_kernel m_findAllNodesMergedAabbKernel; + cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; + + //Binary radix tree construction kernels + cl_kernel m_computeAdjacentPairCommonPrefixKernel; + cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; + cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; + cl_kernel m_findDistanceFromRootKernel; + cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; + + cl_kernel m_findLeafIndexRangesKernel; + + //Traversal kernels + cl_kernel m_plbvhCalculateOverlappingPairsKernel; + cl_kernel m_plbvhRayTraverseKernel; + cl_kernel m_plbvhLargeAabbAabbTestKernel; + cl_kernel m_plbvhLargeAabbRayTestKernel; + + b3RadixSort32CL m_radixSorter; + + //1 element + b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node + b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() + + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) + b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; + b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index + b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + + //1 element per internal node; for binary radix tree construction + b3OpenCLArray<b3Int64> m_commonPrefixes; + b3OpenCLArray<int> m_commonPrefixLengths; + b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root + + //1 element per leaf node (leaf nodes only include small AABBs) + b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs + b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes + b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs + + //1 element per large AABB, which is not stored in the BVH + b3OpenCLArray<b3SapAabb> m_largeAabbs; + +public: + b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvh(); + + ///Must be called before any other function + void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices); + + ///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. + ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. + ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. + void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs); + + ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; + ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. + ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. + ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. + void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); + +private: + void constructBinaryRadixTree(); +}; + +#endif diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp new file mode 100644 index 0000000000..d2618024ac --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -0,0 +1,80 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "b3GpuParallelLinearBvhBroadphase.h" + +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : + m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + + m_aabbsGpu(context, queue), + m_smallAabbsMappingGpu(context, queue), + m_largeAabbsMappingGpu(context, queue) +{ +} + +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_smallAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_largeAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} + +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) +{ + //Reconstruct BVH + m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); + + // + m_overlappingPairsGpu.resize(maxPairs); + m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu); +} +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + b3Assert(0); //CPU version not implemented +} + +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ + m_aabbsGpu.copyFromHost(m_aabbsCpu); + m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu); + m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu); +} + + + diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h new file mode 100644 index 0000000000..e518500637 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -0,0 +1,66 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H +#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H + +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" + +#include "b3GpuParallelLinearBvh.h" + +class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface +{ + b3GpuParallelLinearBvh m_plbvh; + + b3OpenCLArray<b3Int4> m_overlappingPairsGpu; + + b3OpenCLArray<b3SapAabb> m_aabbsGpu; + b3OpenCLArray<int> m_smallAabbsMappingGpu; + b3OpenCLArray<int> m_largeAabbsMappingGpu; + + b3AlignedObjectArray<b3SapAabb> m_aabbsCpu; + b3AlignedObjectArray<int> m_smallAabbsMappingCpu; + b3AlignedObjectArray<int> m_largeAabbsMappingCpu; + +public: + b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvhBroadphase() {} + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } + + virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; } + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; } + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } + + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; } + + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) + { + return new b3GpuParallelLinearBvhBroadphase(context, device, queue); + } +}; + +#endif diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp new file mode 100644 index 0000000000..c45fbbdcaa --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -0,0 +1,1366 @@ + +bool searchIncremental3dSapOnGpu = true; +#include <limits.h> +#include "b3GpuSapBroadphase.h" +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/sapKernels.h" + +#include "Bullet3Common/b3MinMax.h" + +#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" + +/* + + + + + + + b3OpenCLArray<int> m_pairCount; + + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + { + return m_allAabbsGPU; + } + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + { + return m_allAabbsCPU; + } + + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + + b3OpenCLArray<b3Int4> m_overlappingPairs; + + //temporary gpu work memory + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + + class b3PrefixScanFloat4CL* m_prefixScanFloat4; + */ + +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType) +:m_context(ctx), +m_device(device), +m_queue(q), + +m_objectMinMaxIndexGPUaxis0(ctx,q), +m_objectMinMaxIndexGPUaxis1(ctx,q), +m_objectMinMaxIndexGPUaxis2(ctx,q), +m_objectMinMaxIndexGPUaxis0prev(ctx,q), +m_objectMinMaxIndexGPUaxis1prev(ctx,q), +m_objectMinMaxIndexGPUaxis2prev(ctx,q), +m_sortedAxisGPU0(ctx,q), +m_sortedAxisGPU1(ctx,q), +m_sortedAxisGPU2(ctx,q), +m_sortedAxisGPU0prev(ctx,q), +m_sortedAxisGPU1prev(ctx,q), +m_sortedAxisGPU2prev(ctx,q), +m_addedHostPairsGPU(ctx,q), +m_removedHostPairsGPU(ctx,q), +m_addedCountGPU(ctx,q), +m_removedCountGPU(ctx,q), +m_currentBuffer(-1), +m_pairCount(ctx,q), +m_allAabbsGPU(ctx,q), +m_sum(ctx,q), +m_sum2(ctx,q), +m_dst(ctx,q), +m_smallAabbsMappingGPU(ctx,q), +m_largeAabbsMappingGPU(ctx,q), +m_overlappingPairs(ctx,q), +m_gpuSmallSortData(ctx,q), +m_gpuSmallSortedAabbs(ctx,q) +{ + const char* sapSrc = sapCL; + + + cl_int errNum=0; + + b3Assert(m_context); + b3Assert(m_device); + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); + b3Assert(errNum==CL_SUCCESS); + + + b3Assert(errNum==CL_SUCCESS); +#ifndef __APPLE__ + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); +#else + m_prefixScanFloat4 = 0; +#endif + m_sapKernel = 0; + + switch (kernelType) + { + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: + { + m_sapKernel=0; + break; + } + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); + break; + } + + case B3_GPU_SAP_KERNEL_ORIGINAL: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + break; + } + case B3_GPU_SAP_KERNEL_BARRIER: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + break; + } + case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + break; + } + + default: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); + } + }; + + + + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + + + m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + + m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + + m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + + m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); +} + +b3GpuSapBroadphase::~b3GpuSapBroadphase() +{ + delete m_sorter; + delete m_prefixScanFloat4; + + clReleaseKernel(m_scatterKernel); + clReleaseKernel(m_flipFloatKernel); + clReleaseKernel(m_copyAabbsKernel); + clReleaseKernel(m_sapKernel); + clReleaseKernel(m_sap2Kernel); + clReleaseKernel(m_prepareSumVarianceKernel); + + +} + +/// conservative test for overlap between two aabbs +static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, + const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +{ + bool overlap = true; + overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; + overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap; + overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap; + return overlap; +} + + + +//http://stereopsis.com/radix.html +static unsigned int FloatFlip(float fl) +{ + unsigned int f = *(unsigned int*)&fl; + unsigned int mask = -(int)(f >> 31) | 0x80000000; + return f ^ mask; +}; + +void b3GpuSapBroadphase::init3dSap() +{ + if (m_currentBuffer<0) + { + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + + m_currentBuffer = 0; + for (int axis=0;axis<3;axis++) + { + for (int buf=0;buf<2;buf++) + { + int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = 2*totalNumAabbs; + m_sortedAxisCPU[axis][buf].resize(numEndPoints); + + if (buf==m_currentBuffer) + { + for (int i=0;i<totalNumAabbs;i++) + { + m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; + m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; + m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; + m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; + } + } + } + } + + for (int axis=0;axis<3;axis++) + { + m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); + } + + for (int axis=0;axis<3;axis++) + { + //int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); + m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); + for (int i=0;i<numEndPoints;i++) + { + int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + int newDest = destIndex/2; + if (destIndex&1) + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; + } else + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + } + } + } + + } +} + + +static bool b3PairCmp(const b3Int4& p, const b3Int4& q) +{ + return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); +} + + +static bool operator==(const b3Int4& a,const b3Int4& b) +{ + return a.x == b.x && a.y == b.y; +}; + +static bool operator<(const b3Int4& a,const b3Int4& b) +{ + return a.x < b.x || (a.x == b.x && a.y < b.y); +}; + +static bool operator>(const b3Int4& a,const b3Int4& b) +{ + return a.x > b.x || (a.x == b.x && a.y > b.y); +}; + +b3AlignedObjectArray<b3Int4> addedHostPairs; +b3AlignedObjectArray<b3Int4> removedHostPairs; + +b3AlignedObjectArray<b3SapAabb> preAabbs; + +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +{ + //static int framepje = 0; + //printf("framepje=%d\n",framepje++); + + + B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); + + addedHostPairs.resize(0); + removedHostPairs.resize(0); + + b3Assert(m_currentBuffer>=0); + + { + preAabbs.resize(m_allAabbsCPU.size()); + for (int i=0;i<preAabbs.size();i++) + { + preAabbs[i]=m_allAabbsCPU[i]; + } + } + + + if (m_currentBuffer<0) + return; + { + B3_PROFILE("m_allAabbsGPU.copyToHost"); + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + } + + b3AlignedObjectArray<b3Int4> allPairs; + { + B3_PROFILE("m_overlappingPairs.copyToHost"); + m_overlappingPairs.copyToHost(allPairs); + } + if (0) + { + { + printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", + m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], + m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); + } + + { + printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", + m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], + m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); + } + + + { + b3Int4 newPair; + newPair.x = 40; + newPair.y = 53; + int index = allPairs.findBinarySearch(newPair); + printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); + + { + int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); + printf("overlap=%d\n",overlap); + } + + if (preAabbs.size()) + { + int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); + printf("prevoverlap=%d\n",prevOverlap); + } else + { + printf("unknown prevoverlap\n"); + } + + } + } + + + if (0) + { + for (int i=0;i<m_allAabbsCPU.size();i++) + { + //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); + + + } + + for (int axis=0;axis<3;axis++) + { + for (int buf=0;buf<2;buf++) + { + b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); + } + } + } + + + + m_currentBuffer = 1-m_currentBuffer; + + + + int totalNumAabbs = m_allAabbsCPU.size(); + + { + B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); + for (int i=0;i<totalNumAabbs;i++) + { + + + unsigned int keyMin[3]; + unsigned int keyMax[3]; + for (int axis=0;axis<3;axis++) + { + float vmin=m_allAabbsCPU[i].m_min[axis]; + float vmax = m_allAabbsCPU[i].m_max[axis]; + keyMin[axis] = FloatFlip(vmin); + keyMax[axis] = FloatFlip(vmax); + + m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; + m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; + m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; + m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + } + //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); + + } + } + + + + { + B3_PROFILE("sort m_sortedAxisCPU"); + for (int axis=0;axis<3;axis++) + m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); + } + +#if 0 + if (0) + { + for (int axis=0;axis<3;axis++) + { + //printf("axis %d\n",axis); + for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++) + { + //int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key; + //int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + //printf("[%d]=%d\n",i,value); + } + + } + } +#endif + + { + B3_PROFILE("assign m_objectMinMaxIndexCPU"); + for (int axis=0;axis<3;axis++) + { + int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); + m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); + for (int i=0;i<numEndPoints;i++) + { + int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + int newDest = destIndex/2; + if (destIndex&1) + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; + } else + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + } + } + } + } + +#if 0 + if (0) + { + printf("==========================\n"); + for (int axis=0;axis<3;axis++) + { + unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x; + unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y; + unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y; + unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x; + + int dmin40 = curMinIndex40 - prevMinIndex40; + int dmax40 = curMinIndex40 - prevMinIndex40; + printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40); + printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40); + } + printf(".........................\n"); + for (int axis=0;axis<3;axis++) + { + unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x; + unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y; + unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y; + unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x; + + int dmin40 = curMinIndex53 - prevMinIndex53; + int dmax40 = curMinIndex53 - prevMinIndex53; + printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53); + printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53); + } + + } +#endif + + + int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size(); + int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); + b3Assert(a==b); + b3Assert(b==c); + /* + if (searchIncremental3dSapOnGpu) + { + B3_PROFILE("computePairsIncremental3dSapKernelGPU"); + int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + int maxCapacity = 1024*1024; + { + B3_PROFILE("copy from host"); + m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]); + m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]); + m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]); + + m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]); + m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]); + m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]); + m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]); + m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]); + m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]); + + + m_addedHostPairsGPU.resize(maxCapacity); + m_removedHostPairsGPU.resize(maxCapacity); + + m_addedCountGPU.resize(0); + m_addedCountGPU.push_back(0); + m_removedCountGPU.resize(0); + m_removedCountGPU.push_back(0); + } + + { + B3_PROFILE("launch1D"); + b3LauncherCL launcher(m_queue, m_computePairsIncremental3dSapKernel,"m_computePairsIncremental3dSapKernel"); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL()); + + launcher.setBuffer(m_sortedAxisGPU0.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU1.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU2.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL()); + + + launcher.setBuffer(m_addedHostPairsGPU.getBufferCL()); + launcher.setBuffer(m_removedHostPairsGPU.getBufferCL()); + launcher.setBuffer(m_addedCountGPU.getBufferCL()); + launcher.setBuffer(m_removedCountGPU.getBufferCL()); + launcher.setConst(maxCapacity); + launcher.setConst( numObjects); + launcher.launch1D( numObjects); + clFinish(m_queue); + } + + { + B3_PROFILE("copy to host"); + int addedCountGPU = m_addedCountGPU.at(0); + m_addedHostPairsGPU.resize(addedCountGPU); + m_addedHostPairsGPU.copyToHost(addedHostPairs); + + //printf("addedCountGPU=%d\n",addedCountGPU); + int removedCountGPU = m_removedCountGPU.at(0); + m_removedHostPairsGPU.resize(removedCountGPU); + m_removedHostPairsGPU.copyToHost(removedHostPairs); + //printf("removedCountGPU=%d\n",removedCountGPU); + + } + + + + } + else + */ + { + int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + + B3_PROFILE("actual search"); + for (int i=0;i<numObjects;i++) + { + //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size(); + //int checkObjects[]={40,53}; + //int numCheckObjects = sizeof(checkObjects)/sizeof(int); + + //for (int a=0;a<numCheckObjects ;a++) + + for (int axis=0;axis<3;axis++) + { + //int i = checkObjects[a]; + + unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x; + unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; + unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; + int dmin = curMinIndex - prevMinIndex; + + unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; + + + + int dmax = curMaxIndex - prevMaxIndex; + if (dmin!=0) + { + //printf("for object %d, dmin=%d\n",i,dmin); + } + if (dmax!=0) + { + //printf("for object %d, dmax=%d\n",i,dmax); + } + for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) + { + if (dmin!=0) + { + int stepMin = dmin<0 ? -1 : 1; + for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) + { + int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; + int otherIndex = otherIndex2/2; + if (otherIndex!=i) + { + bool otherIsMax = ((otherIndex2&1)!=0); + + if (otherIsMax) + { + //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); + //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); + + bool overlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) + overlap=false; + } + + // b3Assert(overlap2==overlap); + + bool prevOverlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) + prevOverlap=false; + } + + + //b3Assert(overlap==overlap2); + + + + if (dmin<0) + { + if (overlap && !prevOverlap) + { + //add a pair + b3Int4 newPair; + if (i<=otherIndex) + { + newPair.x = i; + newPair.y = otherIndex; + } else + { + newPair.x = otherIndex; + newPair.y = i; + } + addedHostPairs.push_back(newPair); + } + } + else + { + if (!overlap && prevOverlap) + { + + //remove a pair + b3Int4 removedPair; + if (i<=otherIndex) + { + removedPair.x = i; + removedPair.y = otherIndex; + } else + { + removedPair.x = otherIndex; + removedPair.y = i; + } + removedHostPairs.push_back(removedPair); + } + }//otherisMax + }//if (dmin<0) + }//if (otherIndex!=i) + }//for (int j= + } + + if (dmax!=0) + { + int stepMax = dmax<0 ? -1 : 1; + for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) + { + int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; + int otherIndex = otherIndex2/2; + if (otherIndex!=i) + { + //bool otherIsMin = ((otherIndex2&1)==0); + //if (otherIsMin) + { + //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); + //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); + + bool overlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) + overlap=false; + } + //b3Assert(overlap2==overlap); + + bool prevOverlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) + prevOverlap=false; + } + + + if (dmax>0) + { + if (overlap && !prevOverlap) + { + //add a pair + b3Int4 newPair; + if (i<=otherIndex) + { + newPair.x = i; + newPair.y = otherIndex; + } else + { + newPair.x = otherIndex; + newPair.y = i; + } + addedHostPairs.push_back(newPair); + + } + } + else + { + if (!overlap && prevOverlap) + { + //if (otherIndex2&1==0) -> min? + //remove a pair + b3Int4 removedPair; + if (i<=otherIndex) + { + removedPair.x = i; + removedPair.y = otherIndex; + } else + { + removedPair.x = otherIndex; + removedPair.y = i; + } + removedHostPairs.push_back(removedPair); + + } + } + + }//if (dmin<0) + }//if (otherIndex!=i) + }//for (int j= + } + }//for (int otherbuffer + }//for (int axis=0; + }//for (int i=0;i<numObjects + } + + //remove duplicates and add/remove then to existing m_overlappingPairs + + + + { + { + B3_PROFILE("sort allPairs"); + allPairs.quickSort(b3PairCmp); + } + { + B3_PROFILE("sort addedHostPairs"); + addedHostPairs.quickSort(b3PairCmp); + } + { + B3_PROFILE("sort removedHostPairs"); + removedHostPairs.quickSort(b3PairCmp); + } + } + + b3Int4 prevPair; + prevPair.x = -1; + prevPair.y = -1; + + int uniqueRemovedPairs = 0; + + b3AlignedObjectArray<int> removedPositions; + + { + B3_PROFILE("actual removing"); + for (int i=0;i<removedHostPairs.size();i++) + { + b3Int4 removedPair = removedHostPairs[i]; + if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) + { + + int index1 = allPairs.findBinarySearch(removedPair); + + //#ifdef _DEBUG + + + + int index2 = allPairs.findLinearSearch(removedPair); + b3Assert(index1==index2); + + //b3Assert(index1!=allPairs.size()); + if (index1<allPairs.size()) + //#endif//_DEBUG + { + uniqueRemovedPairs++; + removedPositions.push_back(index1); + { + //printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y); + } + } + } + prevPair = removedPair; + } + + if (uniqueRemovedPairs) + { + for (int i=0;i<removedPositions.size();i++) + { + allPairs[removedPositions[i]].x = INT_MAX ; + allPairs[removedPositions[i]].y = INT_MAX ; + } + allPairs.quickSort(b3PairCmp); + allPairs.resize(allPairs.size()-uniqueRemovedPairs); + } + } + //if (uniqueRemovedPairs) + // printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs); + //printf("removedHostPairs.size = %d\n",removedHostPairs.size()); + + prevPair.x = -1; + prevPair.y = -1; + + int uniqueAddedPairs=0; + b3AlignedObjectArray<b3Int4> actualAddedPairs; + + { + B3_PROFILE("actual adding"); + for (int i=0;i<addedHostPairs.size();i++) + { + b3Int4 newPair = addedHostPairs[i]; + if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) + { +//#ifdef _DEBUG + int index1 = allPairs.findBinarySearch(newPair); + + + int index2 = allPairs.findLinearSearch(newPair); + b3Assert(index1==index2); + + + b3Assert(index1==allPairs.size()); + if (index1!=allPairs.size()) + { + printf("??\n"); + } + + if (index1==allPairs.size()) +//#endif //_DEBUG + { + uniqueAddedPairs++; + actualAddedPairs.push_back(newPair); + } + } + prevPair = newPair; + } + for (int i=0;i<actualAddedPairs.size();i++) + { + //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); + allPairs.push_back(actualAddedPairs[i]); + } + } + + //if (uniqueAddedPairs) + // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); + + + { + B3_PROFILE("m_overlappingPairs.copyFromHost"); + m_overlappingPairs.copyFromHost(allPairs); + } + + +} + + + + +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + //test +// if (m_currentBuffer>=0) + // return calculateOverlappingPairsHostIncremental3Sap(); + + b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + + + + int axis=0; + { + B3_PROFILE("CPU compute best variance axis"); + b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); + int numRigidBodies = m_smallAabbsMappingCPU.size(); + + for(int i=0;i<numRigidBodies;i++) + { + b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + + b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); + b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); + b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; + + s += centerAabb; + s2 += centerAabb*centerAabb; + } + b3Vector3 v = s2 - (s*s) / (float)numRigidBodies; + + if(v[1] > v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + + + + b3AlignedObjectArray<b3Int4> hostPairs; + + { + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + for (int i=0;i<numSmallAabbs;i++) + { + b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + //float reference = smallAabbi.m_max[axis]; + + for (int j=i+1;j<numSmallAabbs;j++) + { + + b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; + + if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, + (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) + { + b3Int4 pair; + int a = smallAabbi.m_minIndices[3]; + int b = smallAabbj.m_minIndices[3]; + if (a<=b) + { + pair.x = a;//store the original index in the unsorted aabb array + pair.y = b; + } else + { + pair.x = b;//store the original index in the unsorted aabb array + pair.y = a; + } + hostPairs.push_back(pair); + } + } + } + } + + + { + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + for (int i=0;i<numSmallAabbs;i++) + { + b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + + //float reference = smallAabbi.m_max[axis]; + int numLargeAabbs = m_largeAabbsMappingCPU.size(); + + for (int j=0;j<numLargeAabbs;j++) + { + b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; + if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, + (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) + { + b3Int4 pair; + int a = largeAabbj.m_minIndices[3]; + int b = smallAabbi.m_minIndices[3]; + if (a<=b) + { + pair.x = a; + pair.y = b;//store the original index in the unsorted aabb array + } else + { + pair.x = b; + pair.y = a;//store the original index in the unsorted aabb array + } + + hostPairs.push_back(pair); + } + } + } + } + + if (hostPairs.size() > maxPairs) + { + hostPairs.resize(maxPairs); + } + + if (hostPairs.size()) + { + m_overlappingPairs.copyFromHost(hostPairs); + } else + { + m_overlappingPairs.resize(0); + } + + //init3dSap(); + +} + +void b3GpuSapBroadphase::reset() +{ + m_allAabbsGPU.resize(0); + m_allAabbsCPU.resize(0); + + + m_smallAabbsMappingGPU.resize(0); + m_smallAabbsMappingCPU.resize(0); + + m_pairCount.resize(0); + + m_largeAabbsMappingGPU.resize(0); + m_largeAabbsMappingCPU.resize(0); + +} + + +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +{ + if (m_sapKernel==0) + { + calculateOverlappingPairsHost(maxPairs); + return; + } + + //if (m_currentBuffer>=0) + // return calculateOverlappingPairsHostIncremental3Sap(); + + //calculateOverlappingPairsHost(maxPairs); + + B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs"); + + int axis = 0; + + { + + //bool syncOnHost = false; + + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + if (m_prefixScanFloat4 && numSmallAabbs) + { + B3_PROFILE("GPU compute best variance axis"); + + if (m_dst.size()!=(numSmallAabbs+1)) + { + m_dst.resize(numSmallAabbs+128); + m_sum.resize(numSmallAabbs+128); + m_sum2.resize(numSmallAabbs+128); + m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? + m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? + } + + b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); + launcher.setBuffer(m_allAabbsGPU.getBufferCL()); + + launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launcher.setBuffer(m_sum.getBufferCL()); + launcher.setBuffer(m_sum2.getBufferCL()); + launcher.setConst( numSmallAabbs ); + int num = numSmallAabbs; + launcher.launch1D( num); + + + b3Vector3 s; + b3Vector3 s2; + m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); + m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + + b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; + + if(v[1] > v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + + + m_gpuSmallSortData.resize(numSmallAabbs); + + +#if 1 + if (m_smallAabbsMappingGPU.size()) + { + + B3_PROFILE("flipFloatKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + launcher.setConst( axis ); + + int num = numSmallAabbs; + launcher.launch1D( num); + clFinish(m_queue); + } + + if (m_gpuSmallSortData.size()) + { + B3_PROFILE("gpu radix sort"); + m_sorter->execute(m_gpuSmallSortData); + clFinish(m_queue); + } + + m_gpuSmallSortedAabbs.resize(numSmallAabbs); + if (numSmallAabbs) + { + B3_PROFILE("scatterKernel"); + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D( num); + clFinish(m_queue); + + } + + + m_overlappingPairs.resize(maxPairs); + + m_pairCount.resize(0); + m_pairCount.push_back(0); + int numPairs=0; + + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + //@todo + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), + b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), + b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); +//@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + numPairs = m_pairCount.at(0); + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + } + } + if (m_gpuSmallSortedAabbs.size()) + { + B3_PROFILE("sapKernel"); + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); + + + int num = numSmallAabbs; +#if 0 + int buffSize = launcher.getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;i<buffSize+1;i++) + { + unsigned char* ptr = (unsigned char*)&buf[i]; + *ptr = 0xff; + } + int actualWrite = launcher.serializeArguments(buf,buffSize); + + unsigned char* cptr = (unsigned char*)&buf[buffSize]; + // printf("buf[buffSize] = %d\n",*cptr); + + assert(buf[buffSize]==0xff);//check for buffer overrun + int* ptr = (int*)&buf[buffSize]; + + *ptr = num; + + FILE* f = fopen("m_sapKernelArgs.bin","wb"); + fwrite(buf,buffSize+sizeof(int),1,f); + fclose(f); +#endif// + + launcher.launch1D( num); + clFinish(m_queue); + + numPairs = m_pairCount.at(0); + if (numPairs>maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + m_pairCount.resize(0); + m_pairCount.push_back(maxPairs); + } + } + +#else + int numPairs = 0; + + + b3LauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D( num); + + b3OpenCLArray<int> pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size()/sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; + b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + + } else { + printf("error: cannot find file %s\n",fileName); + } + + clFinish(m_queue); + + +#endif + + + m_overlappingPairs.resize(numPairs); + + }//B3_PROFILE("GPU_RADIX SORT"); + //init3dSap(); +} + +void b3GpuSapBroadphase::writeAabbsToGpu() +{ + m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); + m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); + + m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this + + + +} + +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + int index = userPtr; + b3SapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); + + m_allAabbsCPU.push_back(aabb); +} + +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + int index = userPtr; + b3SapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); + + + m_allAabbsCPU.push_back(aabb); +} + +cl_mem b3GpuSapBroadphase::getAabbBufferWS() +{ + return m_allAabbsGPU.getBufferCL(); +} + +int b3GpuSapBroadphase::getNumOverlap() +{ + return m_overlappingPairs.size(); +} +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() +{ + return m_overlappingPairs.getBufferCL(); +} + +b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU() +{ + return m_overlappingPairs; +} +b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h new file mode 100644 index 0000000000..8d36ac78f2 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -0,0 +1,151 @@ +#ifndef B3_GPU_SAP_BROADPHASE_H +#define B3_GPU_SAP_BROADPHASE_H + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +class b3Vector3; +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +#include "b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" + +#include "b3GpuBroadphaseInterface.h" + + +class b3GpuSapBroadphase : public b3GpuBroadphaseInterface +{ + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_flipFloatKernel; + cl_kernel m_scatterKernel ; + cl_kernel m_copyAabbsKernel; + cl_kernel m_sapKernel; + cl_kernel m_sap2Kernel; + cl_kernel m_prepareSumVarianceKernel; + + + class b3RadixSort32CL* m_sorter; + + ///test for 3d SAP + b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; + b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; + + b3OpenCLArray<b3SortData> m_sortedAxisGPU0; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2; + b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; + + + b3OpenCLArray<b3Int4> m_addedHostPairsGPU; + b3OpenCLArray<b3Int4> m_removedHostPairsGPU; + b3OpenCLArray<int> m_addedCountGPU; + b3OpenCLArray<int> m_removedCountGPU; + + int m_currentBuffer; + +public: + + b3OpenCLArray<int> m_pairCount; + + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + { + return m_allAabbsGPU; + } + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + { + return m_allAabbsCPU; + } + + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + + b3OpenCLArray<b3Int4> m_overlappingPairs; + + //temporary gpu work memory + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + + class b3PrefixScanFloat4CL* m_prefixScanFloat4; + + enum b3GpuSapKernelType + { + B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1, + B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU, + B3_GPU_SAP_KERNEL_ORIGINAL, + B3_GPU_SAP_KERNEL_BARRIER, + B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY + }; + + b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + virtual ~b3GpuSapBroadphase(); + + static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); + } + + static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); + } + + static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL); + } + static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER); + } + static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + } + + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + void reset(); + + void init3dSap(); + virtual void calculateOverlappingPairsHostIncremental3Sap(); + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); +}; + +#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h new file mode 100644 index 0000000000..ea6550fede --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h @@ -0,0 +1,14 @@ +#ifndef B3_SAP_AABB_H +#define B3_SAP_AABB_H + +#include "Bullet3Common/b3Scalar.h" +#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" + +///just make sure that the b3Aabb is 16-byte aligned +B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb +{ + +}; + + +#endif //B3_SAP_AABB_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl new file mode 100644 index 0000000000..ded4796d33 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl @@ -0,0 +1,216 @@ + + +int getPosHash(int4 gridPos, __global float4* pParams) +{ + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x &= gridDim.x - 1; + gridPos.y &= gridDim.y - 1; + gridPos.z &= gridDim.z - 1; + int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x; + return hash; +} + +int4 getGridPos(float4 worldPos, __global float4* pParams) +{ + int4 gridPos; + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1); + gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1); + gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1); + return gridPos; +} + + +// calculate grid hash value for each body using its AABB +__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + float4 bbMin = allpAABB[smallAabbMapping[index]*2]; + float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + pos.w = 0.f; + // get address in grid + int4 gridPos = getGridPos(pos, pParams); + int gridHash = getPosHash(gridPos, pParams); + // store grid hash and body index + int2 hashVal; + hashVal.x = gridHash; + hashVal.y = index; + pHash[index] = hashVal; +} + +__kernel void kClearCellStart( int numCells, + __global int* pCellStart ) +{ + int index = get_global_id(0); + if(index >= numCells) + { + return; + } + pCellStart[index] = -1; +} + +__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart ) +{ + __local int sharedHash[513]; + int index = get_global_id(0); + int2 sortedData; + + if(index < numObjects) + { + sortedData = pHash[index]; + // Load hash data into shared memory so that we can look + // at neighboring body's hash value without loading + // two hash values per thread + sharedHash[get_local_id(0) + 1] = sortedData.x; + if((index > 0) && (get_local_id(0) == 0)) + { + // first thread in block must load neighbor body hash + sharedHash[0] = pHash[index-1].x; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(index < numObjects) + { + if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)])) + { + cellStart[sortedData.x] = index; + } + } +} + +int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1) +{ + return (min0.x <= max1.x)&& (min1.x <= max0.x) && + (min0.y <= max1.y)&& (min1.y <= max0.y) && + (min0.z <= max1.z)&& (min1.z <= max0.z); +} + + + + +//search for AABB 'index' against other AABBs' in this cell +void findPairsInCell( int numObjects, + int4 gridPos, + int index, + __global int2* pHash, + __global int* pCellStart, + __global float4* allpAABB, + __global const int* smallAabbMapping, + __global float4* pParams, + volatile __global int* pairCount, + __global int4* pPairBuff2, + int maxPairs + ) +{ + int4 pGridDim = *((__global int4*)(pParams + 1)); + int maxBodiesPerCell = pGridDim.w; + int gridHash = getPosHash(gridPos, pParams); + // get start of bucket for this cell + int bucketStart = pCellStart[gridHash]; + if (bucketStart == -1) + { + return; // cell empty + } + // iterate over bodies in this cell + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; + float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1]; + int handleIndex = as_int(min0.w); + + int bucketEnd = bucketStart + maxBodiesPerCell; + bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd; + for(int index2 = bucketStart; index2 < bucketEnd; index2++) + { + int2 cellData = pHash[index2]; + if (cellData.x != gridHash) + { + break; // no longer in same bucket + } + int unsorted_indx2 = cellData.y; + //if (unsorted_indx2 < unsorted_indx) // check not colliding with self + if (unsorted_indx2 != unsorted_indx) // check not colliding with self + { + float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0]; + float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1]; + if(testAABBOverlap(min0, max0, min1, max1)) + { + if (pairCount) + { + int handleIndex2 = as_int(min1.w); + if (handleIndex<handleIndex2) + { + int curPair = atomic_add(pairCount,1); + if (curPair<maxPairs) + { + int4 newpair; + newpair.x = handleIndex; + newpair.y = handleIndex2; + newpair.z = -1; + newpair.w = -1; + pPairBuff2[curPair] = newpair; + } + } + + } + } + } + } +} + +__kernel void kFindOverlappingPairs( int numObjects, + __global float4* allpAABB, + __global const int* smallAabbMapping, + __global int2* pHash, + __global int* pCellStart, + __global float4* pParams , + volatile __global int* pairCount, + __global int4* pPairBuff2, + int maxPairs + ) + +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; + float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + // get address in grid + int4 gridPosA = getGridPos(pos, pParams); + int4 gridPosB; + // examine only neighbouring cells + for(int z=-1; z<=1; z++) + { + gridPosB.z = gridPosA.z + z; + for(int y=-1; y<=1; y++) + { + gridPosB.y = gridPosA.y + y; + for(int x=-1; x<=1; x++) + { + gridPosB.x = gridPosA.x + x; + findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs); + } + } + } +} + + + + + diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h new file mode 100644 index 0000000000..dad42477c3 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -0,0 +1,199 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* gridBroadphaseCL= \ +"int getPosHash(int4 gridPos, __global float4* pParams)\n" +"{\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x &= gridDim.x - 1;\n" +" gridPos.y &= gridDim.y - 1;\n" +" gridPos.z &= gridDim.z - 1;\n" +" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" +" return hash;\n" +"} \n" +"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" +"{\n" +" int4 gridPos;\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" +" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" +" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" +" return gridPos;\n" +"}\n" +"// calculate grid hash value for each body using its AABB\n" +"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" +" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" pos.w = 0.f;\n" +" // get address in grid\n" +" int4 gridPos = getGridPos(pos, pParams);\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // store grid hash and body index\n" +" int2 hashVal;\n" +" hashVal.x = gridHash;\n" +" hashVal.y = index;\n" +" pHash[index] = hashVal;\n" +"}\n" +"__kernel void kClearCellStart( int numCells, \n" +" __global int* pCellStart )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numCells)\n" +" {\n" +" return;\n" +" }\n" +" pCellStart[index] = -1;\n" +"}\n" +"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" +"{\n" +" __local int sharedHash[513];\n" +" int index = get_global_id(0);\n" +" int2 sortedData;\n" +" if(index < numObjects)\n" +" {\n" +" sortedData = pHash[index];\n" +" // Load hash data into shared memory so that we can look \n" +" // at neighboring body's hash value without loading\n" +" // two hash values per thread\n" +" sharedHash[get_local_id(0) + 1] = sortedData.x;\n" +" if((index > 0) && (get_local_id(0) == 0))\n" +" {\n" +" // first thread in block must load neighbor body hash\n" +" sharedHash[0] = pHash[index-1].x;\n" +" }\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" if(index < numObjects)\n" +" {\n" +" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" +" {\n" +" cellStart[sortedData.x] = index;\n" +" }\n" +" }\n" +"}\n" +"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" +"{\n" +" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" +" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" +" (min0.z <= max1.z)&& (min1.z <= max0.z); \n" +"}\n" +"//search for AABB 'index' against other AABBs' in this cell\n" +"void findPairsInCell( int numObjects,\n" +" int4 gridPos,\n" +" int index,\n" +" __global int2* pHash,\n" +" __global int* pCellStart,\n" +" __global float4* allpAABB, \n" +" __global const int* smallAabbMapping,\n" +" __global float4* pParams,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2,\n" +" int maxPairs\n" +" )\n" +"{\n" +" int4 pGridDim = *((__global int4*)(pParams + 1));\n" +" int maxBodiesPerCell = pGridDim.w;\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // get start of bucket for this cell\n" +" int bucketStart = pCellStart[gridHash];\n" +" if (bucketStart == -1)\n" +" {\n" +" return; // cell empty\n" +" }\n" +" // iterate over bodies in this cell\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" +" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +" int handleIndex = as_int(min0.w);\n" +" \n" +" int bucketEnd = bucketStart + maxBodiesPerCell;\n" +" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" +" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" +" {\n" +" int2 cellData = pHash[index2];\n" +" if (cellData.x != gridHash)\n" +" {\n" +" break; // no longer in same bucket\n" +" }\n" +" int unsorted_indx2 = cellData.y;\n" +" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" +" { \n" +" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" +" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" +" if(testAABBOverlap(min0, max0, min1, max1))\n" +" {\n" +" if (pairCount)\n" +" {\n" +" int handleIndex2 = as_int(min1.w);\n" +" if (handleIndex<handleIndex2)\n" +" {\n" +" int curPair = atomic_add(pairCount,1);\n" +" if (curPair<maxPairs)\n" +" {\n" +" int4 newpair;\n" +" newpair.x = handleIndex;\n" +" newpair.y = handleIndex2;\n" +" newpair.z = -1;\n" +" newpair.w = -1;\n" +" pPairBuff2[curPair] = newpair;\n" +" }\n" +" }\n" +" \n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void kFindOverlappingPairs( int numObjects,\n" +" __global float4* allpAABB, \n" +" __global const int* smallAabbMapping,\n" +" __global int2* pHash, \n" +" __global int* pCellStart, \n" +" __global float4* pParams ,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2,\n" +" int maxPairs\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" +" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" // get address in grid\n" +" int4 gridPosA = getGridPos(pos, pParams);\n" +" int4 gridPosB; \n" +" // examine only neighbouring cells\n" +" for(int z=-1; z<=1; z++) \n" +" {\n" +" gridPosB.z = gridPosA.z + z;\n" +" for(int y=-1; y<=1; y++) \n" +" {\n" +" gridPosB.y = gridPosA.y + y;\n" +" for(int x=-1; x<=1; x++) \n" +" {\n" +" gridPosB.x = gridPosA.x + x;\n" +" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" +" }\n" +" }\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl new file mode 100644 index 0000000000..c375b9bf37 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -0,0 +1,767 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +typedef float b3Scalar; +typedef float4 b3Vector3; +#define b3Max max +#define b3Min min +#define b3Sqrt sqrt + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} b3AabbCL; + + +unsigned int interleaveBits(unsigned int x) +{ + //........ ........ ......12 3456789A //x + //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits + + //......12 3456789A ......12 3456789A //x ^ (x << 16) + //11111111 ........ ........ 11111111 //0x FF 00 00 FF + //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF; + + //......12 ........ 3456789A 3456789A //x ^ (x << 8) + //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F + //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F; + + //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4) + //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3 + //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3; + + //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2) + //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49 + //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249; + + //........ ........ ......11 11111111 //0x000003FF + x &= 0x000003FF; //Clear all bits above bit 10 + + x = (x ^ (x << 16)) & 0xFF0000FF; + x = (x ^ (x << 8)) & 0x0300F00F; + x = (x ^ (x << 4)) & 0x030C30C3; + x = (x ^ (x << 2)) & 0x09249249; + + return x; +} +unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) +{ + return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; +} + +__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate) +{ + int separatedAabbIndex = get_global_id(0); + if(separatedAabbIndex >= numAabbsToSeparate) return; + + int unseparatedAabbIndex = aabbIndices[separatedAabbIndex]; + out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex]; +} + +//Should replace with an optimized parallel reduction +__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge) +{ + //Each time this kernel is added to the command queue, + //the number of AABBs needing to be merged is halved + // + //Example with 159 AABBs: + // numRemainingAabbs == 159 / 2 + 159 % 2 == 80 + // numMergedAabbs == 159 - 80 == 79 + //So, indices [0, 78] are merged with [0 + 80, 78 + 80] + + int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2; + int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs; + + int aabbIndex = get_global_id(0); + if(aabbIndex >= numMergedAabbs) return; + + int otherAabbIndex = aabbIndex + numRemainingAabbs; + + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; +} + +__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, + __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs) +{ + int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index + if(leafNodeIndex >= numAabbs) return; + + b3AabbCL mergedAabb = mergedAabbOfAllNodes[0]; + b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f; + b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024; + + b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f; + b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter; + + //Quantize into integer coordinates + //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size + b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize; + + int4 discretePosition; + discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) ); + discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) ); + discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) ); + + //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023] + discretePosition = b3Max( -512, b3Min(discretePosition, 511) ); + discretePosition += 512; + + //Interleave bits(assign a morton code, also known as a z-curve) + unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z); + + // + SortDataCL mortonCodeIndexPair; + mortonCodeIndexPair.m_key = mortonCode; + mortonCodeIndexPair.m_value = leafNodeIndex; + + out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair; +} + +#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128 + +//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes. +//If it is set, then the index is for an internal node; otherwise, it is a leaf node. +//In both cases, the bit should be cleared to access the actual node index. +int isLeafNode(int index) { return (index >> 31 == 0); } +int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); } +int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); } + +//From sap.cl +#define NEW_PAIR_MARKER -1 + +bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +//From sap.cl + +__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + + __global SortDataCL* mortonCodesAndAabbIndices, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numQueryAabbs) +{ + //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since + //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent) + int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(queryBvhNodeIndex >= numQueryAabbs) return; + + int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value; + b3AabbCL queryAabb = rigidAabbs[queryRigidIndex]; + + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //Optimization - if the BVH is structured as a binary radix tree, then + //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]). + //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself. + { + int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y; + if(highestLeafIndex <= queryBvhNodeIndex) continue; + } + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) ) + { + if(isLeaf) + { + int4 pair; + pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3]; + pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + + } +} + + +//From rayCastKernels.cl +typedef struct +{ + float4 m_from; + float4 m_to; +} b3RayInfo; +//From rayCastKernels.cl + +b3Vector3 b3Vector3_normalize(b3Vector3 v) +{ + b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f}; + return normalize(normal); //OpenCL normalize == vector4 normalize +} +b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; } +b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; } + +int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb) +{ + //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ). + //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane. + // + //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane + //and min.x will be the far plane; otherwise, it is reversed. + // + //In order for there to be a collision, the t_min and t_max of each pair must overlap. + //This can be tested for by selecting the highest t_min and lowest t_max and comparing them. + + int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y) + + //When using vector types, the select() function checks the most signficant bit, + //but isless() sets the least significant bit. + isNegative <<= 31; + + //select(b, a, condition) == condition ? a : b + //When using select() with vector types, (condition[i]) is true if its most significant bit is 1 + b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection; + b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection; + + b3Scalar t_min_final = 0.0f; + b3Scalar t_max_final = rayLength; + + //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. + //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4]) + //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN. + t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) ); + t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) ); + + return (t_min_final <= t_max_final); +} + +__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + __global SortDataCL* mortonCodesAndAabbIndices, + + __global b3RayInfo* rays, + + __global int* out_numRayRigidPairs, + __global int2* out_rayRigidPairs, + int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + // + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + // + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) ) + { + if(isLeaf) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + } +} + +__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids) +{ + int smallAabbIndex = get_global_id(0); + if(smallAabbIndex >= numSmallAabbRigids) return; + + b3AabbCL smallAabb = smallAabbs[smallAabbIndex]; + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL largeAabb = largeAabbs[i]; + if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) ) + { + int4 pair; + pair.x = largeAabb.m_minIndices[3]; + pair.y = smallAabb.m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + } +} +__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays, + __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs, + int numLargeAabbRigids, int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL rigidAabb = largeRigidAabbs[i]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) ) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabb.m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + } +} + + +//Set so that it is always greater than the actual common prefixes, and never selected as a parent node. +//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve. +//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node. +//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve). +#define B3_PLBVH_INVALID_COMMON_PREFIX 128 + +#define B3_PLBVH_ROOT_NODE_MARKER -1 + +#define b3Int64 long + +int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); } +b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) +{ + //This function only needs to return (i & j) in order for the algorithm to work, + //but it may help with debugging to mask out the lower bits. + + b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j); + + b3Int64 sharedBits = i & j; + b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0 + + return sharedBits & bitmask; +} + +//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths +int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB) +{ + return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) ); +} + +__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices, + __global b3Int64* out_commonPrefixes, + __global int* out_commonPrefixLengths, + int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if (internalNodeIndex >= numInternalNodes) return; + + //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index, + //and the number of internal nodes is always numLeafNodes - 1 + int leftLeafIndex = internalNodeIndex; + int rightLeafIndex = internalNodeIndex + 1; + + int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key; + int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key; + + //Binary radix tree construction algorithm does not work if there are duplicate morton codes. + //Append the index of each leaf node to each morton code so that there are no duplicates. + //The algorithm also requires that the morton codes are sorted in ascending order; this requirement + //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true. + // + //upsample(a, b) == ( ((b3Int64)a) << 32) | b + b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex); + b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex); + + out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); + out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); +} + + +__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes, + __global int2* out_childNodes, int numLeafNodes) +{ + int leafNodeIndex = get_global_id(0); + if (leafNodeIndex >= numLeafNodes) return; + + int numInternalNodes = numLeafNodes - 1; + + int leftSplitIndex = leafNodeIndex - 1; + int rightSplitIndex = leafNodeIndex; + + int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + //Parent node is the highest adjacent common prefix that is lower than the node's common prefix + //Leaf nodes are considered as having the highest common prefix + int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix); + + //Handle cases for the edge nodes; the first and last node + //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX + if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false; + if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true; + + int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex; + out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex; + + int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + int isLeaf = 1; + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex); +} + +__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths, + __global int2* out_childNodes, + __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex, + int numInternalNodes) +{ + int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + b3Int64 nodePrefix = commonPrefixes[internalNodeIndex]; + int nodePrefixLength = commonPrefixLengths[internalNodeIndex]; + +//#define USE_LINEAR_SEARCH +#ifdef USE_LINEAR_SEARCH + int leftIndex = -1; + int rightIndex = -1; + + //Find nearest element to left with a lower common prefix + for(int i = internalNodeIndex - 1; i >= 0; --i) + { + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + leftIndex = i; + break; + } + } + + //Find nearest element to right with a lower common prefix + for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i) + { + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + rightIndex = i; + break; + } + } + +#else //Use binary search + + //Find nearest element to left with a lower common prefix + int leftIndex = -1; + { + int lower = 0; + int upper = internalNodeIndex - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int right = mid + 1; + if(right < internalNodeIndex) + { + b3Int64 rightPrefix = commonPrefixes[right]; + int rightPrefixLength = commonPrefixLengths[right]; + + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + lower = right; + leftIndex = right; + } + else + { + leftIndex = mid; + break; + } + } + else + { + leftIndex = mid; + break; + } + } + else upper = mid - 1; + } + } + + //Find nearest element to right with a lower common prefix + int rightIndex = -1; + { + int lower = internalNodeIndex + 1; + int upper = numInternalNodes - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int left = mid - 1; + if(left > internalNodeIndex) + { + b3Int64 leftPrefix = commonPrefixes[left]; + int leftPrefixLength = commonPrefixLengths[left]; + + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + upper = left; + rightIndex = left; + } + else + { + rightIndex = mid; + break; + } + } + else + { + rightIndex = mid; + break; + } + } + else lower = mid + 1; + } + } +#endif + + //Select parent + { + int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength); + + if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false; + else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true; + + int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex; + + int isRootNode = (leftIndex == -1 && rightIndex == -1); + out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER; + + int isLeaf = 0; + if(!isRootNode) + { + int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } + else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } +} + +__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes, + __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes) +{ + if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0); + + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + // + int distanceFromRoot = 0; + { + int parentIndex = internalNodeParentNodes[internalNodeIndex]; + while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER) + { + parentIndex = internalNodeParentNodes[parentIndex]; + ++distanceFromRoot; + } + } + out_distanceFromRoot[internalNodeIndex] = distanceFromRoot; + + // + __local int localMaxDistanceFromRoot; + if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + atomic_max(&localMaxDistanceFromRoot, distanceFromRoot); + barrier(CLK_LOCAL_MEM_FENCE); + + if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot); +} + +__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices, + __global int2* childNodes, + __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs, + int maxDistanceFromRoot, int processedDistance, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int distance = distanceFromRoot[internalNodeIndex]; + + if(distance == processedDistance) + { + int leftChildIndex = childNodes[internalNodeIndex].x; + int rightChildIndex = childNodes[internalNodeIndex].y; + + int isLeftChildLeaf = isLeafNode(leftChildIndex); + int isRightChildLeaf = isLeafNode(rightChildIndex); + + leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex); + rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex); + + //leftRigidIndex/rightRigidIndex is not used if internal node + int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; + int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; + + b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex]; + b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); + mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); + internalNodeAabbs[internalNodeIndex] = mergedAabb; + } +} + +__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int numLeafNodes = numInternalNodes + 1; + + int2 childNodes = internalNodeChildNodes[internalNodeIndex]; + + int2 leafIndexRange; //x == min leaf index, y == max leaf index + + //Find lowest leaf index covered by this internal node + { + int lowestIndex = childNodes.x; //childNodes.x == Left child + while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x; + leafIndexRange.x = lowestIndex; + } + + //Find highest leaf index covered by this internal node + { + int highestIndex = childNodes.y; //childNodes.y == Right child + while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y; + leafIndexRange.y = highestIndex; + } + + // + out_leafIndexRanges[internalNodeIndex] = leafIndexRange; +} diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h new file mode 100644 index 0000000000..5eb8f45b16 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -0,0 +1,729 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* parallelLinearBvhCL= \ +"/*\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Initial Author Jackson Lee, 2014\n" +"typedef float b3Scalar;\n" +"typedef float4 b3Vector3;\n" +"#define b3Max max\n" +"#define b3Min min\n" +"#define b3Sqrt sqrt\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} b3AabbCL;\n" +"unsigned int interleaveBits(unsigned int x)\n" +"{\n" +" //........ ........ ......12 3456789A //x\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" +" \n" +" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" +" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" +" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" \n" +" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" +" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" +" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" +" \n" +" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" +" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" +" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" +" \n" +" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" +" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" //........ ........ ......11 11111111 //0x000003FF\n" +" x &= 0x000003FF; //Clear all bits above bit 10\n" +" \n" +" x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" x = (x ^ (x << 8)) & 0x0300F00F;\n" +" x = (x ^ (x << 4)) & 0x030C30C3;\n" +" x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" return x;\n" +"}\n" +"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" +"{\n" +" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" +"}\n" +"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" +"{\n" +" int separatedAabbIndex = get_global_id(0);\n" +" if(separatedAabbIndex >= numAabbsToSeparate) return;\n" +" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" +" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" +"}\n" +"//Should replace with an optimized parallel reduction\n" +"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" +"{\n" +" //Each time this kernel is added to the command queue, \n" +" //the number of AABBs needing to be merged is halved\n" +" //\n" +" //Example with 159 AABBs:\n" +" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" +" // numMergedAabbs == 159 - 80 == 79\n" +" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" +" \n" +" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" +" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" +" \n" +" int aabbIndex = get_global_id(0);\n" +" if(aabbIndex >= numMergedAabbs) return;\n" +" \n" +" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" +"}\n" +"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" +" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" +"{\n" +" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" +" if(leafNodeIndex >= numAabbs) return;\n" +" \n" +" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" +" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" +" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" +" \n" +" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" +" \n" +" //Quantize into integer coordinates\n" +" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" +" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" +" \n" +" int4 discretePosition;\n" +" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" +" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" +" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" +" \n" +" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" +" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" +" discretePosition += 512;\n" +" \n" +" //Interleave bits(assign a morton code, also known as a z-curve)\n" +" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" +" \n" +" //\n" +" SortDataCL mortonCodeIndexPair;\n" +" mortonCodeIndexPair.m_key = mortonCode;\n" +" mortonCodeIndexPair.m_value = leafNodeIndex;\n" +" \n" +" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" +"}\n" +"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" +"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" +"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" +"//In both cases, the bit should be cleared to access the actual node index.\n" +"int isLeafNode(int index) { return (index >> 31 == 0); }\n" +"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" +"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" +"//From sap.cl\n" +"#define NEW_PAIR_MARKER -1\n" +"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"//From sap.cl\n" +"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" \n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numQueryAabbs)\n" +"{\n" +" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" +" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" +" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(queryBvhNodeIndex >= numQueryAabbs) return;\n" +" \n" +" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" +" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" +" \n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //Optimization - if the BVH is structured as a binary radix tree, then\n" +" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" +" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" +" {\n" +" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" +" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" +" }\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int4 pair;\n" +" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" +" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +"//From rayCastKernels.cl\n" +"typedef struct\n" +"{\n" +" float4 m_from;\n" +" float4 m_to;\n" +"} b3RayInfo;\n" +"//From rayCastKernels.cl\n" +"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" +"{\n" +" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" +" return normalize(normal); //OpenCL normalize == vector4 normalize\n" +"}\n" +"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" +"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" +"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" +"{\n" +" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" +" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" +" //\n" +" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" +" //and min.x will be the far plane; otherwise, it is reversed.\n" +" //\n" +" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" +" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" +" \n" +" int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" +" \n" +" //When using vector types, the select() function checks the most signficant bit, \n" +" //but isless() sets the least significant bit.\n" +" isNegative <<= 31;\n" +" //select(b, a, condition) == condition ? a : b\n" +" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" +" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" \n" +" b3Scalar t_min_final = 0.0f;\n" +" b3Scalar t_max_final = rayLength;\n" +" \n" +" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" +" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" +" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" +" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" +" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" +" \n" +" return (t_min_final <= t_max_final);\n" +"}\n" +"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" \n" +" __global b3RayInfo* rays,\n" +" \n" +" __global int* out_numRayRigidPairs, \n" +" __global int2* out_rayRigidPairs,\n" +" int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" //\n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" //\n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" +"{\n" +" int smallAabbIndex = get_global_id(0);\n" +" if(smallAabbIndex >= numSmallAabbRigids) return;\n" +" \n" +" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL largeAabb = largeAabbs[i];\n" +" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" +" {\n" +" int4 pair;\n" +" pair.x = largeAabb.m_minIndices[3];\n" +" pair.y = smallAabb.m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" +" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" +" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL rigidAabb = largeRigidAabbs[i];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabb.m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" }\n" +"}\n" +"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" +"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" +"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" +"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" +"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" +"#define b3Int64 long\n" +"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" +"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" +"{\n" +" //This function only needs to return (i & j) in order for the algorithm to work,\n" +" //but it may help with debugging to mask out the lower bits.\n" +" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" +" b3Int64 sharedBits = i & j;\n" +" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" +" \n" +" return sharedBits & bitmask;\n" +"}\n" +"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" +"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" +"{\n" +" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" +"}\n" +"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global b3Int64* out_commonPrefixes,\n" +" __global int* out_commonPrefixLengths,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if (internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" +" //and the number of internal nodes is always numLeafNodes - 1\n" +" int leftLeafIndex = internalNodeIndex;\n" +" int rightLeafIndex = internalNodeIndex + 1;\n" +" \n" +" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" +" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" +" \n" +" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" +" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" +" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" +" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" +" //\n" +" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" +" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" +" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" +" \n" +" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +"}\n" +"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" +" __global int2* out_childNodes, int numLeafNodes)\n" +"{\n" +" int leafNodeIndex = get_global_id(0);\n" +" if (leafNodeIndex >= numLeafNodes) return;\n" +" \n" +" int numInternalNodes = numLeafNodes - 1;\n" +" \n" +" int leftSplitIndex = leafNodeIndex - 1;\n" +" int rightSplitIndex = leafNodeIndex;\n" +" \n" +" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" +" //Leaf nodes are considered as having the highest common prefix\n" +" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" +" \n" +" //Handle cases for the edge nodes; the first and last node\n" +" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" +" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" +" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" +" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" +" \n" +" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" int isLeaf = 1;\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" +"}\n" +"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" +" __global int2* out_childNodes,\n" +" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" +" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" +" \n" +"//#define USE_LINEAR_SEARCH\n" +"#ifdef USE_LINEAR_SEARCH\n" +" int leftIndex = -1;\n" +" int rightIndex = -1;\n" +" \n" +" //Find nearest element to left with a lower common prefix\n" +" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" +" {\n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" leftIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" +" {\n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" rightIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +"#else //Use binary search\n" +" //Find nearest element to left with a lower common prefix\n" +" int leftIndex = -1;\n" +" {\n" +" int lower = 0;\n" +" int upper = internalNodeIndex - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int right = mid + 1;\n" +" if(right < internalNodeIndex)\n" +" {\n" +" b3Int64 rightPrefix = commonPrefixes[right];\n" +" int rightPrefixLength = commonPrefixLengths[right];\n" +" \n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" lower = right;\n" +" leftIndex = right;\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else upper = mid - 1;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" int rightIndex = -1;\n" +" {\n" +" int lower = internalNodeIndex + 1;\n" +" int upper = numInternalNodes - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int left = mid - 1;\n" +" if(left > internalNodeIndex)\n" +" {\n" +" b3Int64 leftPrefix = commonPrefixes[left];\n" +" int leftPrefixLength = commonPrefixLengths[left];\n" +" \n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" upper = left;\n" +" rightIndex = left;\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else lower = mid + 1;\n" +" }\n" +" }\n" +"#endif\n" +" \n" +" //Select parent\n" +" {\n" +" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" +" \n" +" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" +" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" +" \n" +" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" +" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" +" \n" +" int isLeaf = 0;\n" +" if(!isRootNode)\n" +" {\n" +" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +"}\n" +"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" +" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" +"{\n" +" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //\n" +" int distanceFromRoot = 0;\n" +" {\n" +" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" +" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" +" {\n" +" parentIndex = internalNodeParentNodes[parentIndex];\n" +" ++distanceFromRoot;\n" +" }\n" +" }\n" +" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" +" \n" +" //\n" +" __local int localMaxDistanceFromRoot;\n" +" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" +"}\n" +"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int2* childNodes,\n" +" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" +" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int distance = distanceFromRoot[internalNodeIndex];\n" +" \n" +" if(distance == processedDistance)\n" +" {\n" +" int leftChildIndex = childNodes[internalNodeIndex].x;\n" +" int rightChildIndex = childNodes[internalNodeIndex].y;\n" +" \n" +" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" +" int isRightChildLeaf = isLeafNode(rightChildIndex);\n" +" \n" +" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" +" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" +" \n" +" //leftRigidIndex/rightRigidIndex is not used if internal node\n" +" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" +" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" +" \n" +" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" +" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" +" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" +" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" +" }\n" +"}\n" +"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int numLeafNodes = numInternalNodes + 1;\n" +" \n" +" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" +" \n" +" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" +" \n" +" //Find lowest leaf index covered by this internal node\n" +" {\n" +" int lowestIndex = childNodes.x; //childNodes.x == Left child\n" +" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" +" leafIndexRange.x = lowestIndex;\n" +" }\n" +" \n" +" //Find highest leaf index covered by this internal node\n" +" {\n" +" int highestIndex = childNodes.y; //childNodes.y == Right child\n" +" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" +" leafIndexRange.y = highestIndex;\n" +" }\n" +" \n" +" //\n" +" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl new file mode 100644 index 0000000000..93f77a6433 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl @@ -0,0 +1,389 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#define NEW_PAIR_MARKER -1 + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + + +/// conservative test for overlap between two aabbs +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2); +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + + +__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numUnsortedAabbs) + return; + + int j = get_global_id(1); + if (j>=numUnSortedAabbs2) + return; + + + __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]]; + __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]]; + + if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2)) + { + int4 myPair; + + int xIndex = unsortedAabbPtr[0].m_minIndices[3]; + int yIndex = unsortedAabbPtr2[0].m_minIndices[3]; + if (xIndex>yIndex) + { + int tmp = xIndex; + xIndex=yIndex; + yIndex=tmp; + } + + myPair.x = xIndex; + myPair.y = yIndex; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } +} + + + +__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + for (int j=i+1;j<numObjects;j++) + { + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } +} + +__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + for (int j=i+1;j<numObjects;j++) + { + if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) + { + break; + } + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } +} + + + + +__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + int localId = get_local_id(0); + + __local int numActiveWgItems[1]; + __local int breakRequest[1]; + + if (localId==0) + { + numActiveWgItems[0] = 0; + breakRequest[0] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + atomic_inc(numActiveWgItems); + barrier(CLK_LOCAL_MEM_FENCE); + int localBreak = 0; + + int j=i+1; + do + { + barrier(CLK_LOCAL_MEM_FENCE); + + if (j<numObjects) + { + if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) + { + if (!localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (j>=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } + j++; + + } while (breakRequest[0]<numActiveWgItems[0]); +} + + +__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + int localId = get_local_id(0); + + __local int numActiveWgItems[1]; + __local int breakRequest[1]; + __local btAabbCL localAabbs[128];// = aabbs[i]; + + btAabbCL myAabb; + + myAabb = (i<numObjects)? aabbs[i]:aabbs[0]; + float testValue = myAabb.m_maxElems[axis]; + + if (localId==0) + { + numActiveWgItems[0] = 0; + breakRequest[0] = 0; + } + int localCount=0; + int block=0; + localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0]; + localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + atomic_inc(numActiveWgItems); + barrier(CLK_LOCAL_MEM_FENCE); + int localBreak = 0; + + int j=i+1; + do + { + barrier(CLK_LOCAL_MEM_FENCE); + + if (j<numObjects) + { + if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) + { + if (!localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (j>=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1])) + { + int4 myPair; + myPair.x = myAabb.m_minIndices[3]; + myPair.y = localAabbs[localCount+localId+1].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + localCount++; + if (localCount==64) + { + localCount = 0; + block+=64; + localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0]; + localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0]; + } + j++; + + } while (breakRequest[0]<numActiveWgItems[0]); + +} + + + + +//http://stereopsis.com/radix.html +unsigned int FloatFlip(float fl); +unsigned int FloatFlip(float fl) +{ + unsigned int f = *(unsigned int*)&fl; + unsigned int mask = -(int)(f >> 31) | 0x80000000; + return f ^ mask; +} +float IFloatFlip(unsigned int f); +float IFloatFlip(unsigned int f) +{ + unsigned int mask = ((f >> 31) - 1) | 0x80000000; + unsigned int fl = f ^ mask; + return *(float*)&fl; +} + + + + +__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + int src = destAabbs[i].m_maxIndices[3]; + destAabbs[i] = allAabbs[src]; + destAabbs[i].m_maxIndices[3] = src; +} + + +__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + + sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]); + sortData[i].y = i; + +} + + +__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]]; +} + + + +__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs) +{ + int i = get_global_id(0); + if (i>=numAabbs) + return; + + btAabbCL smallAabb = allAabbs[smallAabbMapping[i]]; + + float4 s; + s = (smallAabb.m_max+smallAabb.m_min)*0.5f; + sum[i]=s; + sum2[i]=s*s; +} diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h new file mode 100644 index 0000000000..04d40fcf26 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -0,0 +1,342 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* sapCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#define NEW_PAIR_MARKER -1\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"/// conservative test for overlap between two aabbs\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numUnsortedAabbs)\n" +" return;\n" +" int j = get_global_id(1);\n" +" if (j>=numUnSortedAabbs2)\n" +" return;\n" +" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" +" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" +" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" +" {\n" +" int4 myPair;\n" +" \n" +" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" +" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" +" if (xIndex>yIndex)\n" +" {\n" +" int tmp = xIndex;\n" +" xIndex=yIndex;\n" +" yIndex=tmp;\n" +" }\n" +" \n" +" myPair.x = xIndex;\n" +" myPair.y = yIndex;\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" for (int j=i+1;j<numObjects;j++)\n" +" {\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" for (int j=i+1;j<numObjects;j++)\n" +" {\n" +" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +" {\n" +" break;\n" +" }\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int localId = get_local_id(0);\n" +" __local int numActiveWgItems[1];\n" +" __local int breakRequest[1];\n" +" if (localId==0)\n" +" {\n" +" numActiveWgItems[0] = 0;\n" +" breakRequest[0] = 0;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" atomic_inc(numActiveWgItems);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" int localBreak = 0;\n" +" int j=i+1;\n" +" do\n" +" {\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j<numObjects)\n" +" {\n" +" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +" {\n" +" if (!localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j>=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +" j++;\n" +" } while (breakRequest[0]<numActiveWgItems[0]);\n" +"}\n" +"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int localId = get_local_id(0);\n" +" __local int numActiveWgItems[1];\n" +" __local int breakRequest[1];\n" +" __local btAabbCL localAabbs[128];// = aabbs[i];\n" +" \n" +" btAabbCL myAabb;\n" +" \n" +" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" +" float testValue = myAabb.m_maxElems[axis];\n" +" \n" +" if (localId==0)\n" +" {\n" +" numActiveWgItems[0] = 0;\n" +" breakRequest[0] = 0;\n" +" }\n" +" int localCount=0;\n" +" int block=0;\n" +" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" +" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" atomic_inc(numActiveWgItems);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" int localBreak = 0;\n" +" \n" +" int j=i+1;\n" +" do\n" +" {\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j<numObjects)\n" +" {\n" +" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" +" {\n" +" if (!localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j>=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = myAabb.m_minIndices[3];\n" +" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" localCount++;\n" +" if (localCount==64)\n" +" {\n" +" localCount = 0;\n" +" block+=64; \n" +" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" +" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" +" }\n" +" j++;\n" +" \n" +" } while (breakRequest[0]<numActiveWgItems[0]);\n" +" \n" +"}\n" +"//http://stereopsis.com/radix.html\n" +"unsigned int FloatFlip(float fl);\n" +"unsigned int FloatFlip(float fl)\n" +"{\n" +" unsigned int f = *(unsigned int*)&fl;\n" +" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" +" return f ^ mask;\n" +"}\n" +"float IFloatFlip(unsigned int f);\n" +"float IFloatFlip(unsigned int f)\n" +"{\n" +" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" +" unsigned int fl = f ^ mask;\n" +" return *(float*)&fl;\n" +"}\n" +"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" int src = destAabbs[i].m_maxIndices[3];\n" +" destAabbs[i] = allAabbs[src];\n" +" destAabbs[i].m_maxIndices[3] = src;\n" +"}\n" +"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" \n" +" \n" +" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" +" sortData[i].y = i;\n" +" \n" +"}\n" +"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" \n" +" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" +"}\n" +"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numAabbs)\n" +" return;\n" +" \n" +" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" +" \n" +" float4 s;\n" +" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" +" sum[i]=s;\n" +" sum2[i]=s*s; \n" +"}\n" +; |