diff options
Diffstat (limited to 'thirdparty/bullet/src/Bullet3OpenCL')
124 files changed, 57881 insertions, 0 deletions
diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h new file mode 100644 index 0000000000..0ed8aa8232 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -0,0 +1,44 @@ + +#ifndef B3_GPU_BROADPHASE_INTERFACE_H +#define B3_GPU_BROADPHASE_INTERFACE_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Common/b3Vector3.h" +#include "b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" + +class b3GpuBroadphaseInterface +{ +public: + + typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q); + + virtual ~b3GpuBroadphaseInterface() + { + } + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + + virtual void calculateOverlappingPairs(int maxPairs)=0; + virtual void calculateOverlappingPairsHost(int maxPairs)=0; + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu()=0; + + virtual cl_mem getAabbBufferWS()=0; + virtual int getNumOverlap()=0; + virtual cl_mem getOverlappingPairBuffer()=0; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0; + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0; + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0; + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0; + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0; + +}; + +#endif //B3_GPU_BROADPHASE_INTERFACE_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp new file mode 100644 index 0000000000..74d0c8056c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -0,0 +1,385 @@ + +#include "b3GpuGridBroadphase.h" +#include "Bullet3Geometry/b3AabbUtil.h" +#include "kernels/gridBroadphaseKernels.h" +#include "kernels/sapKernels.h" +//#include "kernels/gridBroadphase.cl" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + + + +#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" +#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" + +cl_kernel kCalcHashAABB; +cl_kernel kClearCellStart; +cl_kernel kFindCellStart; +cl_kernel kFindOverlappingPairs; +cl_kernel m_copyAabbsKernel; +cl_kernel m_sap2Kernel; + + + + + +//int maxPairsPerBody = 64; +int maxBodiesPerCell = 256;//?? + +b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) +:m_context(ctx), +m_device(device), +m_queue(q), +m_allAabbsGPU1(ctx,q), +m_smallAabbsMappingGPU(ctx,q), +m_largeAabbsMappingGPU(ctx,q), +m_gpuPairs(ctx,q), + +m_hashGpu(ctx,q), + +m_cellStartGpu(ctx,q), +m_paramsGPU(ctx,q) +{ + + + b3Vector3 gridSize = b3MakeVector3(3,3,3); + b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); + + m_paramsCPU.m_gridSize[0] = 128; + m_paramsCPU.m_gridSize[1] = 128; + m_paramsCPU.m_gridSize[2] = 128; + m_paramsCPU.m_gridSize[3] = maxBodiesPerCell; + m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell); + m_paramsCPU.m_invCellSize[0] = invGridSize[0]; + m_paramsCPU.m_invCellSize[1] = invGridSize[1]; + m_paramsCPU.m_invCellSize[2] = invGridSize[2]; + m_paramsCPU.m_invCellSize[3] = 0.f; + m_paramsGPU.push_back(m_paramsCPU); + + cl_int errNum=0; + + { + const char* sapSrc = sapCL; + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); + b3Assert(errNum==CL_SUCCESS); + m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + } + + { + + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH); + b3Assert(errNum==CL_SUCCESS); + + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + + + + } + + m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + +} +b3GpuGridBroadphase::~b3GpuGridBroadphase() +{ + clReleaseKernel( kCalcHashAABB); + clReleaseKernel( kClearCellStart); + clReleaseKernel( kFindCellStart); + clReleaseKernel( kFindOverlappingPairs); + clReleaseKernel( m_sap2Kernel); + clReleaseKernel( m_copyAabbsKernel); + + + + delete m_sorter; +} + + + +void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); + + m_allAabbsCPU1.push_back(aabb); + +} +void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); + + m_allAabbsCPU1.push_back(aabb); +} + +void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) +{ + B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); + + + if (0) + { + calculateOverlappingPairsHost(maxPairs); + /* + b3AlignedObjectArray<b3Int4> cpuPairs; + m_gpuPairs.copyToHost(cpuPairs); + printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); + for (int i=0;i<m_gpuPairs.size();i++) + { + printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y); + } + */ + return; + } + + + + + + int numSmallAabbs = m_smallAabbsMappingGPU.size(); + + b3OpenCLArray<int> pairCount(m_context,m_queue); + pairCount.push_back(0); + m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody); + + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ), + b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_gpuPairs.getBufferCL() ), + b3BufferInfoCL(pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( 0 );//axis is not used + launcher.setConst( maxPairs ); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + int numPairs = pairCount.at(0); + + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + } + } + + + + + if (numSmallAabbs) + { + B3_PROFILE("gridKernel"); + m_hashGpu.resize(numSmallAabbs); + { + B3_PROFILE("kCalcHashAABB"); + b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_allAabbsGPU1.getBufferCL()); + launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(this->m_paramsGPU.getBufferCL()); + launch.launch1D(numSmallAabbs); + } + + m_sorter->execute(m_hashGpu); + + int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + m_cellStartGpu.resize(numCells); + //b3AlignedObjectArray<int > cellStartCpu; + + + { + B3_PROFILE("kClearCellStart"); + b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart"); + launch.setConst(numCells); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numCells); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + + { + B3_PROFILE("kFindCellStart"); + b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numSmallAabbs); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + { + B3_PROFILE("kFindOverlappingPairs"); + + + b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs"); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_allAabbsGPU1.getBufferCL()); + launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + + launch.setBuffer(m_paramsGPU.getBufferCL()); + //launch.setBuffer(0); + launch.setBuffer(pairCount.getBufferCL()); + launch.setBuffer(m_gpuPairs.getBufferCL()); + + launch.setConst(maxPairs); + launch.launch1D(numSmallAabbs); + + + int numPairs = pairCount.at(0); + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + + m_gpuPairs.resize(numPairs); + + if (0) + { + b3AlignedObjectArray<b3Int4> pairsCpu; + m_gpuPairs.copyToHost(pairsCpu); + + int sz = m_gpuPairs.size(); + printf("m_gpuPairs.size()=%d\n",sz); + for (int i=0;i<m_gpuPairs.size();i++) + { + printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y); + } + + printf("?!?\n"); + } + + } + + + } + + + + + + //calculateOverlappingPairsHost(maxPairs); +} +void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + + m_hostPairs.resize(0); + m_allAabbsGPU1.copyToHost(m_allAabbsCPU1); + for (int i=0;i<m_allAabbsCPU1.size();i++) + { + for (int j=i+1;j<m_allAabbsCPU1.size();j++) + { + if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec, + m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec)) + { + b3Int4 pair; + int a = m_allAabbsCPU1[j].m_minIndices[3]; + int b = m_allAabbsCPU1[i].m_minIndices[3]; + if (a<=b) + { + pair.x = a; + pair.y = b;//store the original index in the unsorted aabb array + } else + { + pair.x = b; + pair.y = a;//store the original index in the unsorted aabb array + } + + if (m_hostPairs.size()<maxPairs) + { + m_hostPairs.push_back(pair); + } + } + } + } + + + m_gpuPairs.copyFromHost(m_hostPairs); + + +} + + //call writeAabbsToGpu after done making all changes (createProxy etc) +void b3GpuGridBroadphase::writeAabbsToGpu() +{ + m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1); + m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); + m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); + +} + +cl_mem b3GpuGridBroadphase::getAabbBufferWS() +{ + return this->m_allAabbsGPU1.getBufferCL(); +} +int b3GpuGridBroadphase::getNumOverlap() +{ + return m_gpuPairs.size(); +} +cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() +{ + return m_gpuPairs.getBufferCL(); +} + +b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() +{ + return m_allAabbsGPU1; +} + +b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() +{ + return m_allAabbsCPU1; +} + +b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU() +{ + return m_gpuPairs; +} +b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h new file mode 100644 index 0000000000..ec18c9f716 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -0,0 +1,88 @@ +#ifndef B3_GPU_GRID_BROADPHASE_H +#define B3_GPU_GRID_BROADPHASE_H + +#include "b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +struct b3ParamsGridBroadphaseCL +{ + + float m_invCellSize[4]; + int m_gridSize[4]; + + int getMaxBodiesPerCell() const + { + return m_gridSize[3]; + } + + void setMaxBodiesPerCell(int maxOverlap) + { + m_gridSize[3] = maxOverlap; + } +}; + + +class b3GpuGridBroadphase : public b3GpuBroadphaseInterface +{ +protected: + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + b3AlignedObjectArray<b3Int4> m_hostPairs; + b3OpenCLArray<b3Int4> m_gpuPairs; + + b3OpenCLArray<b3SortData> m_hashGpu; + b3OpenCLArray<int> m_cellStartGpu; + + + b3ParamsGridBroadphaseCL m_paramsCPU; + b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; + + class b3RadixSort32CL* m_sorter; + +public: + + b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); + virtual ~b3GpuGridBroadphase(); + + static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuGridBroadphase(ctx,device,q); + } + + + + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); + +}; + +#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp new file mode 100644 index 0000000000..641df9eb12 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -0,0 +1,577 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + +#include "b3GpuParallelLinearBvh.h" + +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : + m_queue(queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + m_maxDistanceFromRoot(context, queue), + m_temp(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_commonPrefixes(context, queue), + m_commonPrefixLengths(context, queue), + m_distanceFromRoot(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue), + + m_largeAabbs(context, queue) +{ + m_rootNodeIndex.resize(1); + m_maxDistanceFromRoot.resize(1); + m_temp.resize(1); + + // + const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + cl_int error; + char* additionalMacros = 0; + m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); + b3Assert(m_parallelLinearBvhProgram); + + m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_separateAabbsKernel); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findAllNodesMergedAabbKernel); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); + + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_computeAdjacentPairCommonPrefixKernel); + m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); + m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findDistanceFromRootKernel); + m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findLeafIndexRangesKernel); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhCalculateOverlappingPairsKernel); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhRayTraverseKernel); + m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbAabbTestKernel); + m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbRayTestKernel); +} + +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +{ + clReleaseKernel(m_separateAabbsKernel); + clReleaseKernel(m_findAllNodesMergedAabbKernel); + clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); + + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); + clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); + clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); + clReleaseKernel(m_findDistanceFromRootKernel); + clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + clReleaseKernel(m_findLeafIndexRangesKernel); + + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); + clReleaseKernel(m_plbvhRayTraverseKernel); + clReleaseKernel(m_plbvhLargeAabbAabbTestKernel); + clReleaseKernel(m_plbvhLargeAabbRayTestKernel); + + clReleaseProgram(m_parallelLinearBvhProgram); +} + +void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices) +{ + B3_PROFILE("b3ParallelLinearBvh::build()"); + + int numLargeAabbs = largeAabbIndices.size(); + int numSmallAabbs = smallAabbIndices.size(); + + //Since all AABBs(both large and small) are input as a contiguous array, + //with 2 additional arrays used to indicate the indices of large and small AABBs, + //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH. + { + B3_PROFILE("Separate large and small AABBs"); + + m_largeAabbs.resize(numLargeAabbs); + m_leafNodeAabbs.resize(numSmallAabbs); + + //Write large AABBs into m_largeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( largeAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_largeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbs); + + launcher.launch1D(numLargeAabbs); + } + + //Write small AABBs into m_leafNodeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( smallAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numSmallAabbs); + + launcher.launch1D(numSmallAabbs); + } + + clFinish(m_queue); + } + + // + int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs + int numInternalNodes = numLeaves - 1; + + if(numLeaves < 2) + { + //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(), + //so it does not matter if numLeaves == 0 and rootNodeIndex == -1 + int rootNodeIndex = numLeaves - 1; + m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); + + //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm, + //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index + //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work. + //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) + if(numLeaves == 1) + { + b3SortData leaf; + leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set + + m_mortonCodesAndAabbIndicies.resize(1); + m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1); + } + + return; + } + + // + { + m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeLeafIndexRanges.resize(numInternalNodes); + m_internalNodeChildNodes.resize(numInternalNodes); + m_internalNodeParentNodes.resize(numInternalNodes); + + m_commonPrefixes.resize(numInternalNodes); + m_commonPrefixLengths.resize(numInternalNodes); + m_distanceFromRoot.resize(numInternalNodes); + + m_leafNodeParentNodes.resize(numLeaves); + m_mortonCodesAndAabbIndicies.resize(numLeaves); + m_mergedAabb.resize(numLeaves); + } + + //Find the merged AABB of all small AABBs; this is used to define the size of + //each cell in the virtual grid for the next kernel(2^10 cells in each dimension). + { + B3_PROFILE("Find AABB of merged nodes"); + + m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array + + for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numAabbsNeedingMerge); + + launcher.launch1D(numAabbsNeedingMerge); + } + + clFinish(m_queue); + } + + //Insert the center of the AABBs into a virtual grid, + //then convert the discrete grid coordinates into a morton code + //For each element in m_mortonCodesAndAabbIndicies, set + // m_key == morton code (value to sort by) + // m_value == small AABB index + { + B3_PROFILE("Assign morton codes"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + // + { + B3_PROFILE("Sort leaves by morton codes"); + + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); + clFinish(m_queue); + } + + // + constructBinaryRadixTree(); + + + //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices. + //The root node contains leaf node indices in the range [0, numLeafNodes - 1]. + //The child nodes of each node split their parent's index range into 2 contiguous halves. + // + //For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31]. + //The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31]. + // + //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice + { + B3_PROFILE("m_findLeafIndexRangesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } +} + +void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs) +{ + int maxPairs = out_overlappingPairs.size(); + b3OpenCLArray<int>& numPairsGpu = m_temp; + + int reset = 0; + numPairsGpu.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 1 ) + { + B3_PROFILE("PLBVH small-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH large-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + + // + int numPairs = -1; + numPairsGpu.copyToHostPointer(&numPairs, 1); + if(numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + numPairsGpu.copyFromHostPointer(&maxPairs, 1); + } + + out_overlappingPairs.resize(numPairs); +} + + +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) +{ + B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); + + int numRays = rays.size(); + int maxRayRigidPairs = out_rayRigidPairs.size(); + + int reset = 0; + out_numRayRigidPairs.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH ray test small AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if(numLargeAabbRigids > 0) + { + B3_PROFILE("PLBVH ray test large AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + // + int numRayRigidPairs = -1; + out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); + + if(numRayRigidPairs > maxRayRigidPairs) + b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); + +} + +void b3GpuParallelLinearBvh::constructBinaryRadixTree() +{ + B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); + + int numLeaves = m_leafNodeAabbs.size(); + int numInternalNodes = numLeaves - 1; + + //Each internal node is placed in between 2 leaf nodes. + //By using this arrangement and computing the common prefix between + //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree. + { + B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //For each leaf node, select its parent node by + //comparing the 2 nearest internal nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + //For each internal node, perform 2 binary searches among the other internal nodes + //to its left and right to find its potential parent nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Find the number of nodes seperating each internal node and the root node + //so that the AABBs can be set using the next kernel. + //Also determine the maximum number of nodes separating an internal node and the root node. + { + B3_PROFILE("m_findDistanceFromRootKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Starting from the internal nodes nearest to the leaf nodes, recursively move up + //the tree towards the root to set the AABBs of each internal node; each internal node + //checks its children and merges their AABBs + { + B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); + + int maxDistanceFromRoot = -1; + { + B3_PROFILE("copy maxDistanceFromRoot to CPU"); + m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); + clFinish(m_queue); + } + + for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxDistanceFromRoot); + launcher.setConst(distanceFromRoot); + launcher.setConst(numInternalNodes); + + //It may seem inefficent to launch a thread for each internal node when a + //much smaller number of nodes is actually processed, but this is actually + //faster than determining the exact nodes that are ready to merge their child AABBs. + launcher.launch1D(numInternalNodes); + } + + clFinish(m_queue); + } +} + +
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h new file mode 100644 index 0000000000..effe617b7b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -0,0 +1,125 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H +#define B3_GPU_PARALLEL_LINEAR_BVH_H + +//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" + +#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" + +#define b3Int64 cl_long + +///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame +///@remarks +///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details. +///@par +///Related papers: \n +///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n +///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n +///@par +///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Sort morton codes +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs +///@par +///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. +///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. +///Instead of searching for the child nodes of each internal node, we search for the parent node of each node. +///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs. +class b3GpuParallelLinearBvh +{ + cl_command_queue m_queue; + + cl_program m_parallelLinearBvhProgram; + + cl_kernel m_separateAabbsKernel; + cl_kernel m_findAllNodesMergedAabbKernel; + cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; + + //Binary radix tree construction kernels + cl_kernel m_computeAdjacentPairCommonPrefixKernel; + cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; + cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; + cl_kernel m_findDistanceFromRootKernel; + cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; + + cl_kernel m_findLeafIndexRangesKernel; + + //Traversal kernels + cl_kernel m_plbvhCalculateOverlappingPairsKernel; + cl_kernel m_plbvhRayTraverseKernel; + cl_kernel m_plbvhLargeAabbAabbTestKernel; + cl_kernel m_plbvhLargeAabbRayTestKernel; + + b3RadixSort32CL m_radixSorter; + + //1 element + b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node + b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() + + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) + b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; + b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index + b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + + //1 element per internal node; for binary radix tree construction + b3OpenCLArray<b3Int64> m_commonPrefixes; + b3OpenCLArray<int> m_commonPrefixLengths; + b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root + + //1 element per leaf node (leaf nodes only include small AABBs) + b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs + b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes + b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs + + //1 element per large AABB, which is not stored in the BVH + b3OpenCLArray<b3SapAabb> m_largeAabbs; + +public: + b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvh(); + + ///Must be called before any other function + void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices); + + ///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. + ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. + ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. + void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs); + + ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; + ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. + ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. + ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. + void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); + +private: + void constructBinaryRadixTree(); +}; + +#endif diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp new file mode 100644 index 0000000000..d2618024ac --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -0,0 +1,80 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "b3GpuParallelLinearBvhBroadphase.h" + +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : + m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + + m_aabbsGpu(context, queue), + m_smallAabbsMappingGpu(context, queue), + m_largeAabbsMappingGpu(context, queue) +{ +} + +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_smallAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_largeAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} + +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) +{ + //Reconstruct BVH + m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); + + // + m_overlappingPairsGpu.resize(maxPairs); + m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu); +} +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + b3Assert(0); //CPU version not implemented +} + +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ + m_aabbsGpu.copyFromHost(m_aabbsCpu); + m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu); + m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu); +} + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h new file mode 100644 index 0000000000..e518500637 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -0,0 +1,66 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H +#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H + +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" + +#include "b3GpuParallelLinearBvh.h" + +class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface +{ + b3GpuParallelLinearBvh m_plbvh; + + b3OpenCLArray<b3Int4> m_overlappingPairsGpu; + + b3OpenCLArray<b3SapAabb> m_aabbsGpu; + b3OpenCLArray<int> m_smallAabbsMappingGpu; + b3OpenCLArray<int> m_largeAabbsMappingGpu; + + b3AlignedObjectArray<b3SapAabb> m_aabbsCpu; + b3AlignedObjectArray<int> m_smallAabbsMappingCpu; + b3AlignedObjectArray<int> m_largeAabbsMappingCpu; + +public: + b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvhBroadphase() {} + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } + + virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; } + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; } + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } + + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; } + + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) + { + return new b3GpuParallelLinearBvhBroadphase(context, device, queue); + } +}; + +#endif diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp new file mode 100644 index 0000000000..c45fbbdcaa --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -0,0 +1,1366 @@ + +bool searchIncremental3dSapOnGpu = true; +#include <limits.h> +#include "b3GpuSapBroadphase.h" +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/sapKernels.h" + +#include "Bullet3Common/b3MinMax.h" + +#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" + +/* + + + + + + + b3OpenCLArray<int> m_pairCount; + + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + { + return m_allAabbsGPU; + } + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + { + return m_allAabbsCPU; + } + + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + + b3OpenCLArray<b3Int4> m_overlappingPairs; + + //temporary gpu work memory + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + + class b3PrefixScanFloat4CL* m_prefixScanFloat4; + */ + +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType) +:m_context(ctx), +m_device(device), +m_queue(q), + +m_objectMinMaxIndexGPUaxis0(ctx,q), +m_objectMinMaxIndexGPUaxis1(ctx,q), +m_objectMinMaxIndexGPUaxis2(ctx,q), +m_objectMinMaxIndexGPUaxis0prev(ctx,q), +m_objectMinMaxIndexGPUaxis1prev(ctx,q), +m_objectMinMaxIndexGPUaxis2prev(ctx,q), +m_sortedAxisGPU0(ctx,q), +m_sortedAxisGPU1(ctx,q), +m_sortedAxisGPU2(ctx,q), +m_sortedAxisGPU0prev(ctx,q), +m_sortedAxisGPU1prev(ctx,q), +m_sortedAxisGPU2prev(ctx,q), +m_addedHostPairsGPU(ctx,q), +m_removedHostPairsGPU(ctx,q), +m_addedCountGPU(ctx,q), +m_removedCountGPU(ctx,q), +m_currentBuffer(-1), +m_pairCount(ctx,q), +m_allAabbsGPU(ctx,q), +m_sum(ctx,q), +m_sum2(ctx,q), +m_dst(ctx,q), +m_smallAabbsMappingGPU(ctx,q), +m_largeAabbsMappingGPU(ctx,q), +m_overlappingPairs(ctx,q), +m_gpuSmallSortData(ctx,q), +m_gpuSmallSortedAabbs(ctx,q) +{ + const char* sapSrc = sapCL; + + + cl_int errNum=0; + + b3Assert(m_context); + b3Assert(m_device); + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); + b3Assert(errNum==CL_SUCCESS); + + + b3Assert(errNum==CL_SUCCESS); +#ifndef __APPLE__ + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); +#else + m_prefixScanFloat4 = 0; +#endif + m_sapKernel = 0; + + switch (kernelType) + { + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: + { + m_sapKernel=0; + break; + } + case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); + break; + } + + case B3_GPU_SAP_KERNEL_ORIGINAL: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + break; + } + case B3_GPU_SAP_KERNEL_BARRIER: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + break; + } + case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + break; + } + + default: + { + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); + } + }; + + + + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + + + m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + + m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + + m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + + m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); +} + +b3GpuSapBroadphase::~b3GpuSapBroadphase() +{ + delete m_sorter; + delete m_prefixScanFloat4; + + clReleaseKernel(m_scatterKernel); + clReleaseKernel(m_flipFloatKernel); + clReleaseKernel(m_copyAabbsKernel); + clReleaseKernel(m_sapKernel); + clReleaseKernel(m_sap2Kernel); + clReleaseKernel(m_prepareSumVarianceKernel); + + +} + +/// conservative test for overlap between two aabbs +static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, + const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +{ + bool overlap = true; + overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; + overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap; + overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap; + return overlap; +} + + + +//http://stereopsis.com/radix.html +static unsigned int FloatFlip(float fl) +{ + unsigned int f = *(unsigned int*)&fl; + unsigned int mask = -(int)(f >> 31) | 0x80000000; + return f ^ mask; +}; + +void b3GpuSapBroadphase::init3dSap() +{ + if (m_currentBuffer<0) + { + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + + m_currentBuffer = 0; + for (int axis=0;axis<3;axis++) + { + for (int buf=0;buf<2;buf++) + { + int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = 2*totalNumAabbs; + m_sortedAxisCPU[axis][buf].resize(numEndPoints); + + if (buf==m_currentBuffer) + { + for (int i=0;i<totalNumAabbs;i++) + { + m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; + m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; + m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; + m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; + } + } + } + } + + for (int axis=0;axis<3;axis++) + { + m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); + } + + for (int axis=0;axis<3;axis++) + { + //int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); + m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); + for (int i=0;i<numEndPoints;i++) + { + int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + int newDest = destIndex/2; + if (destIndex&1) + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; + } else + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + } + } + } + + } +} + + +static bool b3PairCmp(const b3Int4& p, const b3Int4& q) +{ + return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); +} + + +static bool operator==(const b3Int4& a,const b3Int4& b) +{ + return a.x == b.x && a.y == b.y; +}; + +static bool operator<(const b3Int4& a,const b3Int4& b) +{ + return a.x < b.x || (a.x == b.x && a.y < b.y); +}; + +static bool operator>(const b3Int4& a,const b3Int4& b) +{ + return a.x > b.x || (a.x == b.x && a.y > b.y); +}; + +b3AlignedObjectArray<b3Int4> addedHostPairs; +b3AlignedObjectArray<b3Int4> removedHostPairs; + +b3AlignedObjectArray<b3SapAabb> preAabbs; + +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +{ + //static int framepje = 0; + //printf("framepje=%d\n",framepje++); + + + B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); + + addedHostPairs.resize(0); + removedHostPairs.resize(0); + + b3Assert(m_currentBuffer>=0); + + { + preAabbs.resize(m_allAabbsCPU.size()); + for (int i=0;i<preAabbs.size();i++) + { + preAabbs[i]=m_allAabbsCPU[i]; + } + } + + + if (m_currentBuffer<0) + return; + { + B3_PROFILE("m_allAabbsGPU.copyToHost"); + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + } + + b3AlignedObjectArray<b3Int4> allPairs; + { + B3_PROFILE("m_overlappingPairs.copyToHost"); + m_overlappingPairs.copyToHost(allPairs); + } + if (0) + { + { + printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", + m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], + m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); + } + + { + printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", + m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], + m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); + } + + + { + b3Int4 newPair; + newPair.x = 40; + newPair.y = 53; + int index = allPairs.findBinarySearch(newPair); + printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); + + { + int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); + printf("overlap=%d\n",overlap); + } + + if (preAabbs.size()) + { + int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); + printf("prevoverlap=%d\n",prevOverlap); + } else + { + printf("unknown prevoverlap\n"); + } + + } + } + + + if (0) + { + for (int i=0;i<m_allAabbsCPU.size();i++) + { + //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); + + + } + + for (int axis=0;axis<3;axis++) + { + for (int buf=0;buf<2;buf++) + { + b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); + } + } + } + + + + m_currentBuffer = 1-m_currentBuffer; + + + + int totalNumAabbs = m_allAabbsCPU.size(); + + { + B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); + for (int i=0;i<totalNumAabbs;i++) + { + + + unsigned int keyMin[3]; + unsigned int keyMax[3]; + for (int axis=0;axis<3;axis++) + { + float vmin=m_allAabbsCPU[i].m_min[axis]; + float vmax = m_allAabbsCPU[i].m_max[axis]; + keyMin[axis] = FloatFlip(vmin); + keyMax[axis] = FloatFlip(vmax); + + m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; + m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; + m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; + m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + } + //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); + + } + } + + + + { + B3_PROFILE("sort m_sortedAxisCPU"); + for (int axis=0;axis<3;axis++) + m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); + } + +#if 0 + if (0) + { + for (int axis=0;axis<3;axis++) + { + //printf("axis %d\n",axis); + for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++) + { + //int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key; + //int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + //printf("[%d]=%d\n",i,value); + } + + } + } +#endif + + { + B3_PROFILE("assign m_objectMinMaxIndexCPU"); + for (int axis=0;axis<3;axis++) + { + int totalNumAabbs = m_allAabbsCPU.size(); + int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); + m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); + for (int i=0;i<numEndPoints;i++) + { + int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; + int newDest = destIndex/2; + if (destIndex&1) + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; + } else + { + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + } + } + } + } + +#if 0 + if (0) + { + printf("==========================\n"); + for (int axis=0;axis<3;axis++) + { + unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x; + unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y; + unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y; + unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x; + + int dmin40 = curMinIndex40 - prevMinIndex40; + int dmax40 = curMinIndex40 - prevMinIndex40; + printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40); + printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40); + } + printf(".........................\n"); + for (int axis=0;axis<3;axis++) + { + unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x; + unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y; + unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y; + unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x; + + int dmin40 = curMinIndex53 - prevMinIndex53; + int dmax40 = curMinIndex53 - prevMinIndex53; + printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53); + printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53); + } + + } +#endif + + + int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size(); + int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); + b3Assert(a==b); + b3Assert(b==c); + /* + if (searchIncremental3dSapOnGpu) + { + B3_PROFILE("computePairsIncremental3dSapKernelGPU"); + int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + int maxCapacity = 1024*1024; + { + B3_PROFILE("copy from host"); + m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]); + m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]); + m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]); + m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]); + + m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]); + m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]); + m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]); + m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]); + m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]); + m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]); + + + m_addedHostPairsGPU.resize(maxCapacity); + m_removedHostPairsGPU.resize(maxCapacity); + + m_addedCountGPU.resize(0); + m_addedCountGPU.push_back(0); + m_removedCountGPU.resize(0); + m_removedCountGPU.push_back(0); + } + + { + B3_PROFILE("launch1D"); + b3LauncherCL launcher(m_queue, m_computePairsIncremental3dSapKernel,"m_computePairsIncremental3dSapKernel"); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL()); + launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL()); + + launcher.setBuffer(m_sortedAxisGPU0.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU1.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU2.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL()); + launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL()); + + + launcher.setBuffer(m_addedHostPairsGPU.getBufferCL()); + launcher.setBuffer(m_removedHostPairsGPU.getBufferCL()); + launcher.setBuffer(m_addedCountGPU.getBufferCL()); + launcher.setBuffer(m_removedCountGPU.getBufferCL()); + launcher.setConst(maxCapacity); + launcher.setConst( numObjects); + launcher.launch1D( numObjects); + clFinish(m_queue); + } + + { + B3_PROFILE("copy to host"); + int addedCountGPU = m_addedCountGPU.at(0); + m_addedHostPairsGPU.resize(addedCountGPU); + m_addedHostPairsGPU.copyToHost(addedHostPairs); + + //printf("addedCountGPU=%d\n",addedCountGPU); + int removedCountGPU = m_removedCountGPU.at(0); + m_removedHostPairsGPU.resize(removedCountGPU); + m_removedHostPairsGPU.copyToHost(removedHostPairs); + //printf("removedCountGPU=%d\n",removedCountGPU); + + } + + + + } + else + */ + { + int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); + + B3_PROFILE("actual search"); + for (int i=0;i<numObjects;i++) + { + //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size(); + //int checkObjects[]={40,53}; + //int numCheckObjects = sizeof(checkObjects)/sizeof(int); + + //for (int a=0;a<numCheckObjects ;a++) + + for (int axis=0;axis<3;axis++) + { + //int i = checkObjects[a]; + + unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x; + unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; + unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; + int dmin = curMinIndex - prevMinIndex; + + unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; + + + + int dmax = curMaxIndex - prevMaxIndex; + if (dmin!=0) + { + //printf("for object %d, dmin=%d\n",i,dmin); + } + if (dmax!=0) + { + //printf("for object %d, dmax=%d\n",i,dmax); + } + for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) + { + if (dmin!=0) + { + int stepMin = dmin<0 ? -1 : 1; + for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) + { + int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; + int otherIndex = otherIndex2/2; + if (otherIndex!=i) + { + bool otherIsMax = ((otherIndex2&1)!=0); + + if (otherIsMax) + { + //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); + //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); + + bool overlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) + overlap=false; + } + + // b3Assert(overlap2==overlap); + + bool prevOverlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) + prevOverlap=false; + } + + + //b3Assert(overlap==overlap2); + + + + if (dmin<0) + { + if (overlap && !prevOverlap) + { + //add a pair + b3Int4 newPair; + if (i<=otherIndex) + { + newPair.x = i; + newPair.y = otherIndex; + } else + { + newPair.x = otherIndex; + newPair.y = i; + } + addedHostPairs.push_back(newPair); + } + } + else + { + if (!overlap && prevOverlap) + { + + //remove a pair + b3Int4 removedPair; + if (i<=otherIndex) + { + removedPair.x = i; + removedPair.y = otherIndex; + } else + { + removedPair.x = otherIndex; + removedPair.y = i; + } + removedHostPairs.push_back(removedPair); + } + }//otherisMax + }//if (dmin<0) + }//if (otherIndex!=i) + }//for (int j= + } + + if (dmax!=0) + { + int stepMax = dmax<0 ? -1 : 1; + for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) + { + int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; + int otherIndex = otherIndex2/2; + if (otherIndex!=i) + { + //bool otherIsMin = ((otherIndex2&1)==0); + //if (otherIsMin) + { + //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); + //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); + + bool overlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) + overlap=false; + } + //b3Assert(overlap2==overlap); + + bool prevOverlap = true; + + for (int ax=0;ax<3;ax++) + { + if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) + prevOverlap=false; + } + + + if (dmax>0) + { + if (overlap && !prevOverlap) + { + //add a pair + b3Int4 newPair; + if (i<=otherIndex) + { + newPair.x = i; + newPair.y = otherIndex; + } else + { + newPair.x = otherIndex; + newPair.y = i; + } + addedHostPairs.push_back(newPair); + + } + } + else + { + if (!overlap && prevOverlap) + { + //if (otherIndex2&1==0) -> min? + //remove a pair + b3Int4 removedPair; + if (i<=otherIndex) + { + removedPair.x = i; + removedPair.y = otherIndex; + } else + { + removedPair.x = otherIndex; + removedPair.y = i; + } + removedHostPairs.push_back(removedPair); + + } + } + + }//if (dmin<0) + }//if (otherIndex!=i) + }//for (int j= + } + }//for (int otherbuffer + }//for (int axis=0; + }//for (int i=0;i<numObjects + } + + //remove duplicates and add/remove then to existing m_overlappingPairs + + + + { + { + B3_PROFILE("sort allPairs"); + allPairs.quickSort(b3PairCmp); + } + { + B3_PROFILE("sort addedHostPairs"); + addedHostPairs.quickSort(b3PairCmp); + } + { + B3_PROFILE("sort removedHostPairs"); + removedHostPairs.quickSort(b3PairCmp); + } + } + + b3Int4 prevPair; + prevPair.x = -1; + prevPair.y = -1; + + int uniqueRemovedPairs = 0; + + b3AlignedObjectArray<int> removedPositions; + + { + B3_PROFILE("actual removing"); + for (int i=0;i<removedHostPairs.size();i++) + { + b3Int4 removedPair = removedHostPairs[i]; + if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) + { + + int index1 = allPairs.findBinarySearch(removedPair); + + //#ifdef _DEBUG + + + + int index2 = allPairs.findLinearSearch(removedPair); + b3Assert(index1==index2); + + //b3Assert(index1!=allPairs.size()); + if (index1<allPairs.size()) + //#endif//_DEBUG + { + uniqueRemovedPairs++; + removedPositions.push_back(index1); + { + //printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y); + } + } + } + prevPair = removedPair; + } + + if (uniqueRemovedPairs) + { + for (int i=0;i<removedPositions.size();i++) + { + allPairs[removedPositions[i]].x = INT_MAX ; + allPairs[removedPositions[i]].y = INT_MAX ; + } + allPairs.quickSort(b3PairCmp); + allPairs.resize(allPairs.size()-uniqueRemovedPairs); + } + } + //if (uniqueRemovedPairs) + // printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs); + //printf("removedHostPairs.size = %d\n",removedHostPairs.size()); + + prevPair.x = -1; + prevPair.y = -1; + + int uniqueAddedPairs=0; + b3AlignedObjectArray<b3Int4> actualAddedPairs; + + { + B3_PROFILE("actual adding"); + for (int i=0;i<addedHostPairs.size();i++) + { + b3Int4 newPair = addedHostPairs[i]; + if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) + { +//#ifdef _DEBUG + int index1 = allPairs.findBinarySearch(newPair); + + + int index2 = allPairs.findLinearSearch(newPair); + b3Assert(index1==index2); + + + b3Assert(index1==allPairs.size()); + if (index1!=allPairs.size()) + { + printf("??\n"); + } + + if (index1==allPairs.size()) +//#endif //_DEBUG + { + uniqueAddedPairs++; + actualAddedPairs.push_back(newPair); + } + } + prevPair = newPair; + } + for (int i=0;i<actualAddedPairs.size();i++) + { + //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); + allPairs.push_back(actualAddedPairs[i]); + } + } + + //if (uniqueAddedPairs) + // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); + + + { + B3_PROFILE("m_overlappingPairs.copyFromHost"); + m_overlappingPairs.copyFromHost(allPairs); + } + + +} + + + + +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + //test +// if (m_currentBuffer>=0) + // return calculateOverlappingPairsHostIncremental3Sap(); + + b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + + + + int axis=0; + { + B3_PROFILE("CPU compute best variance axis"); + b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); + int numRigidBodies = m_smallAabbsMappingCPU.size(); + + for(int i=0;i<numRigidBodies;i++) + { + b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + + b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); + b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); + b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; + + s += centerAabb; + s2 += centerAabb*centerAabb; + } + b3Vector3 v = s2 - (s*s) / (float)numRigidBodies; + + if(v[1] > v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + + + + b3AlignedObjectArray<b3Int4> hostPairs; + + { + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + for (int i=0;i<numSmallAabbs;i++) + { + b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + //float reference = smallAabbi.m_max[axis]; + + for (int j=i+1;j<numSmallAabbs;j++) + { + + b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; + + if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, + (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) + { + b3Int4 pair; + int a = smallAabbi.m_minIndices[3]; + int b = smallAabbj.m_minIndices[3]; + if (a<=b) + { + pair.x = a;//store the original index in the unsorted aabb array + pair.y = b; + } else + { + pair.x = b;//store the original index in the unsorted aabb array + pair.y = a; + } + hostPairs.push_back(pair); + } + } + } + } + + + { + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + for (int i=0;i<numSmallAabbs;i++) + { + b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + + //float reference = smallAabbi.m_max[axis]; + int numLargeAabbs = m_largeAabbsMappingCPU.size(); + + for (int j=0;j<numLargeAabbs;j++) + { + b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; + if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, + (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) + { + b3Int4 pair; + int a = largeAabbj.m_minIndices[3]; + int b = smallAabbi.m_minIndices[3]; + if (a<=b) + { + pair.x = a; + pair.y = b;//store the original index in the unsorted aabb array + } else + { + pair.x = b; + pair.y = a;//store the original index in the unsorted aabb array + } + + hostPairs.push_back(pair); + } + } + } + } + + if (hostPairs.size() > maxPairs) + { + hostPairs.resize(maxPairs); + } + + if (hostPairs.size()) + { + m_overlappingPairs.copyFromHost(hostPairs); + } else + { + m_overlappingPairs.resize(0); + } + + //init3dSap(); + +} + +void b3GpuSapBroadphase::reset() +{ + m_allAabbsGPU.resize(0); + m_allAabbsCPU.resize(0); + + + m_smallAabbsMappingGPU.resize(0); + m_smallAabbsMappingCPU.resize(0); + + m_pairCount.resize(0); + + m_largeAabbsMappingGPU.resize(0); + m_largeAabbsMappingCPU.resize(0); + +} + + +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +{ + if (m_sapKernel==0) + { + calculateOverlappingPairsHost(maxPairs); + return; + } + + //if (m_currentBuffer>=0) + // return calculateOverlappingPairsHostIncremental3Sap(); + + //calculateOverlappingPairsHost(maxPairs); + + B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs"); + + int axis = 0; + + { + + //bool syncOnHost = false; + + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + if (m_prefixScanFloat4 && numSmallAabbs) + { + B3_PROFILE("GPU compute best variance axis"); + + if (m_dst.size()!=(numSmallAabbs+1)) + { + m_dst.resize(numSmallAabbs+128); + m_sum.resize(numSmallAabbs+128); + m_sum2.resize(numSmallAabbs+128); + m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? + m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? + } + + b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); + launcher.setBuffer(m_allAabbsGPU.getBufferCL()); + + launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launcher.setBuffer(m_sum.getBufferCL()); + launcher.setBuffer(m_sum2.getBufferCL()); + launcher.setConst( numSmallAabbs ); + int num = numSmallAabbs; + launcher.launch1D( num); + + + b3Vector3 s; + b3Vector3 s2; + m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); + m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + + b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; + + if(v[1] > v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + + + m_gpuSmallSortData.resize(numSmallAabbs); + + +#if 1 + if (m_smallAabbsMappingGPU.size()) + { + + B3_PROFILE("flipFloatKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + launcher.setConst( axis ); + + int num = numSmallAabbs; + launcher.launch1D( num); + clFinish(m_queue); + } + + if (m_gpuSmallSortData.size()) + { + B3_PROFILE("gpu radix sort"); + m_sorter->execute(m_gpuSmallSortData); + clFinish(m_queue); + } + + m_gpuSmallSortedAabbs.resize(numSmallAabbs); + if (numSmallAabbs) + { + B3_PROFILE("scatterKernel"); + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D( num); + clFinish(m_queue); + + } + + + m_overlappingPairs.resize(maxPairs); + + m_pairCount.resize(0); + m_pairCount.push_back(0); + int numPairs=0; + + { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + //@todo + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), + b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), + b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), + b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); +//@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + numPairs = m_pairCount.at(0); + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + } + } + if (m_gpuSmallSortedAabbs.size()) + { + B3_PROFILE("sapKernel"); + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); + + + int num = numSmallAabbs; +#if 0 + int buffSize = launcher.getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;i<buffSize+1;i++) + { + unsigned char* ptr = (unsigned char*)&buf[i]; + *ptr = 0xff; + } + int actualWrite = launcher.serializeArguments(buf,buffSize); + + unsigned char* cptr = (unsigned char*)&buf[buffSize]; + // printf("buf[buffSize] = %d\n",*cptr); + + assert(buf[buffSize]==0xff);//check for buffer overrun + int* ptr = (int*)&buf[buffSize]; + + *ptr = num; + + FILE* f = fopen("m_sapKernelArgs.bin","wb"); + fwrite(buf,buffSize+sizeof(int),1,f); + fclose(f); +#endif// + + launcher.launch1D( num); + clFinish(m_queue); + + numPairs = m_pairCount.at(0); + if (numPairs>maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + m_pairCount.resize(0); + m_pairCount.push_back(maxPairs); + } + } + +#else + int numPairs = 0; + + + b3LauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D( num); + + b3OpenCLArray<int> pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size()/sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; + b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + + } else { + printf("error: cannot find file %s\n",fileName); + } + + clFinish(m_queue); + + +#endif + + + m_overlappingPairs.resize(numPairs); + + }//B3_PROFILE("GPU_RADIX SORT"); + //init3dSap(); +} + +void b3GpuSapBroadphase::writeAabbsToGpu() +{ + m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); + m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); + + m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this + + + +} + +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + int index = userPtr; + b3SapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); + + m_allAabbsCPU.push_back(aabb); +} + +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +{ + int index = userPtr; + b3SapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); + + + m_allAabbsCPU.push_back(aabb); +} + +cl_mem b3GpuSapBroadphase::getAabbBufferWS() +{ + return m_allAabbsGPU.getBufferCL(); +} + +int b3GpuSapBroadphase::getNumOverlap() +{ + return m_overlappingPairs.size(); +} +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() +{ + return m_overlappingPairs.getBufferCL(); +} + +b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU() +{ + return m_overlappingPairs; +} +b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h new file mode 100644 index 0000000000..8d36ac78f2 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -0,0 +1,151 @@ +#ifndef B3_GPU_SAP_BROADPHASE_H +#define B3_GPU_SAP_BROADPHASE_H + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +class b3Vector3; +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +#include "b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" + +#include "b3GpuBroadphaseInterface.h" + + +class b3GpuSapBroadphase : public b3GpuBroadphaseInterface +{ + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_flipFloatKernel; + cl_kernel m_scatterKernel ; + cl_kernel m_copyAabbsKernel; + cl_kernel m_sapKernel; + cl_kernel m_sap2Kernel; + cl_kernel m_prepareSumVarianceKernel; + + + class b3RadixSort32CL* m_sorter; + + ///test for 3d SAP + b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; + b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; + + b3OpenCLArray<b3SortData> m_sortedAxisGPU0; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2; + b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; + + + b3OpenCLArray<b3Int4> m_addedHostPairsGPU; + b3OpenCLArray<b3Int4> m_removedHostPairsGPU; + b3OpenCLArray<int> m_addedCountGPU; + b3OpenCLArray<int> m_removedCountGPU; + + int m_currentBuffer; + +public: + + b3OpenCLArray<int> m_pairCount; + + + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + { + return m_allAabbsGPU; + } + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + { + return m_allAabbsCPU; + } + + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; + + b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3AlignedObjectArray<int> m_smallAabbsMappingCPU; + + b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3AlignedObjectArray<int> m_largeAabbsMappingCPU; + + + b3OpenCLArray<b3Int4> m_overlappingPairs; + + //temporary gpu work memory + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + + class b3PrefixScanFloat4CL* m_prefixScanFloat4; + + enum b3GpuSapKernelType + { + B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1, + B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU, + B3_GPU_SAP_KERNEL_ORIGINAL, + B3_GPU_SAP_KERNEL_BARRIER, + B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY + }; + + b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + virtual ~b3GpuSapBroadphase(); + + static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); + } + + static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); + } + + static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL); + } + static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER); + } + static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q) + { + return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + } + + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + void reset(); + + void init3dSap(); + virtual void calculateOverlappingPairsHostIncremental3Sap(); + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); + virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); +}; + +#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h new file mode 100644 index 0000000000..ea6550fede --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h @@ -0,0 +1,14 @@ +#ifndef B3_SAP_AABB_H +#define B3_SAP_AABB_H + +#include "Bullet3Common/b3Scalar.h" +#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" + +///just make sure that the b3Aabb is 16-byte aligned +B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb +{ + +}; + + +#endif //B3_SAP_AABB_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl new file mode 100644 index 0000000000..ded4796d33 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl @@ -0,0 +1,216 @@ + + +int getPosHash(int4 gridPos, __global float4* pParams) +{ + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x &= gridDim.x - 1; + gridPos.y &= gridDim.y - 1; + gridPos.z &= gridDim.z - 1; + int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x; + return hash; +} + +int4 getGridPos(float4 worldPos, __global float4* pParams) +{ + int4 gridPos; + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1); + gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1); + gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1); + return gridPos; +} + + +// calculate grid hash value for each body using its AABB +__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + float4 bbMin = allpAABB[smallAabbMapping[index]*2]; + float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + pos.w = 0.f; + // get address in grid + int4 gridPos = getGridPos(pos, pParams); + int gridHash = getPosHash(gridPos, pParams); + // store grid hash and body index + int2 hashVal; + hashVal.x = gridHash; + hashVal.y = index; + pHash[index] = hashVal; +} + +__kernel void kClearCellStart( int numCells, + __global int* pCellStart ) +{ + int index = get_global_id(0); + if(index >= numCells) + { + return; + } + pCellStart[index] = -1; +} + +__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart ) +{ + __local int sharedHash[513]; + int index = get_global_id(0); + int2 sortedData; + + if(index < numObjects) + { + sortedData = pHash[index]; + // Load hash data into shared memory so that we can look + // at neighboring body's hash value without loading + // two hash values per thread + sharedHash[get_local_id(0) + 1] = sortedData.x; + if((index > 0) && (get_local_id(0) == 0)) + { + // first thread in block must load neighbor body hash + sharedHash[0] = pHash[index-1].x; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(index < numObjects) + { + if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)])) + { + cellStart[sortedData.x] = index; + } + } +} + +int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1) +{ + return (min0.x <= max1.x)&& (min1.x <= max0.x) && + (min0.y <= max1.y)&& (min1.y <= max0.y) && + (min0.z <= max1.z)&& (min1.z <= max0.z); +} + + + + +//search for AABB 'index' against other AABBs' in this cell +void findPairsInCell( int numObjects, + int4 gridPos, + int index, + __global int2* pHash, + __global int* pCellStart, + __global float4* allpAABB, + __global const int* smallAabbMapping, + __global float4* pParams, + volatile __global int* pairCount, + __global int4* pPairBuff2, + int maxPairs + ) +{ + int4 pGridDim = *((__global int4*)(pParams + 1)); + int maxBodiesPerCell = pGridDim.w; + int gridHash = getPosHash(gridPos, pParams); + // get start of bucket for this cell + int bucketStart = pCellStart[gridHash]; + if (bucketStart == -1) + { + return; // cell empty + } + // iterate over bodies in this cell + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; + float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1]; + int handleIndex = as_int(min0.w); + + int bucketEnd = bucketStart + maxBodiesPerCell; + bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd; + for(int index2 = bucketStart; index2 < bucketEnd; index2++) + { + int2 cellData = pHash[index2]; + if (cellData.x != gridHash) + { + break; // no longer in same bucket + } + int unsorted_indx2 = cellData.y; + //if (unsorted_indx2 < unsorted_indx) // check not colliding with self + if (unsorted_indx2 != unsorted_indx) // check not colliding with self + { + float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0]; + float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1]; + if(testAABBOverlap(min0, max0, min1, max1)) + { + if (pairCount) + { + int handleIndex2 = as_int(min1.w); + if (handleIndex<handleIndex2) + { + int curPair = atomic_add(pairCount,1); + if (curPair<maxPairs) + { + int4 newpair; + newpair.x = handleIndex; + newpair.y = handleIndex2; + newpair.z = -1; + newpair.w = -1; + pPairBuff2[curPair] = newpair; + } + } + + } + } + } + } +} + +__kernel void kFindOverlappingPairs( int numObjects, + __global float4* allpAABB, + __global const int* smallAabbMapping, + __global int2* pHash, + __global int* pCellStart, + __global float4* pParams , + volatile __global int* pairCount, + __global int4* pPairBuff2, + int maxPairs + ) + +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; + float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + // get address in grid + int4 gridPosA = getGridPos(pos, pParams); + int4 gridPosB; + // examine only neighbouring cells + for(int z=-1; z<=1; z++) + { + gridPosB.z = gridPosA.z + z; + for(int y=-1; y<=1; y++) + { + gridPosB.y = gridPosA.y + y; + for(int x=-1; x<=1; x++) + { + gridPosB.x = gridPosA.x + x; + findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs); + } + } + } +} + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h new file mode 100644 index 0000000000..dad42477c3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -0,0 +1,199 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* gridBroadphaseCL= \ +"int getPosHash(int4 gridPos, __global float4* pParams)\n" +"{\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x &= gridDim.x - 1;\n" +" gridPos.y &= gridDim.y - 1;\n" +" gridPos.z &= gridDim.z - 1;\n" +" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" +" return hash;\n" +"} \n" +"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" +"{\n" +" int4 gridPos;\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" +" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" +" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" +" return gridPos;\n" +"}\n" +"// calculate grid hash value for each body using its AABB\n" +"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" +" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" pos.w = 0.f;\n" +" // get address in grid\n" +" int4 gridPos = getGridPos(pos, pParams);\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // store grid hash and body index\n" +" int2 hashVal;\n" +" hashVal.x = gridHash;\n" +" hashVal.y = index;\n" +" pHash[index] = hashVal;\n" +"}\n" +"__kernel void kClearCellStart( int numCells, \n" +" __global int* pCellStart )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numCells)\n" +" {\n" +" return;\n" +" }\n" +" pCellStart[index] = -1;\n" +"}\n" +"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" +"{\n" +" __local int sharedHash[513];\n" +" int index = get_global_id(0);\n" +" int2 sortedData;\n" +" if(index < numObjects)\n" +" {\n" +" sortedData = pHash[index];\n" +" // Load hash data into shared memory so that we can look \n" +" // at neighboring body's hash value without loading\n" +" // two hash values per thread\n" +" sharedHash[get_local_id(0) + 1] = sortedData.x;\n" +" if((index > 0) && (get_local_id(0) == 0))\n" +" {\n" +" // first thread in block must load neighbor body hash\n" +" sharedHash[0] = pHash[index-1].x;\n" +" }\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" if(index < numObjects)\n" +" {\n" +" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" +" {\n" +" cellStart[sortedData.x] = index;\n" +" }\n" +" }\n" +"}\n" +"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" +"{\n" +" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" +" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" +" (min0.z <= max1.z)&& (min1.z <= max0.z); \n" +"}\n" +"//search for AABB 'index' against other AABBs' in this cell\n" +"void findPairsInCell( int numObjects,\n" +" int4 gridPos,\n" +" int index,\n" +" __global int2* pHash,\n" +" __global int* pCellStart,\n" +" __global float4* allpAABB, \n" +" __global const int* smallAabbMapping,\n" +" __global float4* pParams,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2,\n" +" int maxPairs\n" +" )\n" +"{\n" +" int4 pGridDim = *((__global int4*)(pParams + 1));\n" +" int maxBodiesPerCell = pGridDim.w;\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // get start of bucket for this cell\n" +" int bucketStart = pCellStart[gridHash];\n" +" if (bucketStart == -1)\n" +" {\n" +" return; // cell empty\n" +" }\n" +" // iterate over bodies in this cell\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" +" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +" int handleIndex = as_int(min0.w);\n" +" \n" +" int bucketEnd = bucketStart + maxBodiesPerCell;\n" +" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" +" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" +" {\n" +" int2 cellData = pHash[index2];\n" +" if (cellData.x != gridHash)\n" +" {\n" +" break; // no longer in same bucket\n" +" }\n" +" int unsorted_indx2 = cellData.y;\n" +" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" +" { \n" +" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" +" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" +" if(testAABBOverlap(min0, max0, min1, max1))\n" +" {\n" +" if (pairCount)\n" +" {\n" +" int handleIndex2 = as_int(min1.w);\n" +" if (handleIndex<handleIndex2)\n" +" {\n" +" int curPair = atomic_add(pairCount,1);\n" +" if (curPair<maxPairs)\n" +" {\n" +" int4 newpair;\n" +" newpair.x = handleIndex;\n" +" newpair.y = handleIndex2;\n" +" newpair.z = -1;\n" +" newpair.w = -1;\n" +" pPairBuff2[curPair] = newpair;\n" +" }\n" +" }\n" +" \n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void kFindOverlappingPairs( int numObjects,\n" +" __global float4* allpAABB, \n" +" __global const int* smallAabbMapping,\n" +" __global int2* pHash, \n" +" __global int* pCellStart, \n" +" __global float4* pParams ,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2,\n" +" int maxPairs\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" +" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" // get address in grid\n" +" int4 gridPosA = getGridPos(pos, pParams);\n" +" int4 gridPosB; \n" +" // examine only neighbouring cells\n" +" for(int z=-1; z<=1; z++) \n" +" {\n" +" gridPosB.z = gridPosA.z + z;\n" +" for(int y=-1; y<=1; y++) \n" +" {\n" +" gridPosB.y = gridPosA.y + y;\n" +" for(int x=-1; x<=1; x++) \n" +" {\n" +" gridPosB.x = gridPosA.x + x;\n" +" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" +" }\n" +" }\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl new file mode 100644 index 0000000000..c375b9bf37 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -0,0 +1,767 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +typedef float b3Scalar; +typedef float4 b3Vector3; +#define b3Max max +#define b3Min min +#define b3Sqrt sqrt + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} b3AabbCL; + + +unsigned int interleaveBits(unsigned int x) +{ + //........ ........ ......12 3456789A //x + //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits + + //......12 3456789A ......12 3456789A //x ^ (x << 16) + //11111111 ........ ........ 11111111 //0x FF 00 00 FF + //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF; + + //......12 ........ 3456789A 3456789A //x ^ (x << 8) + //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F + //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F; + + //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4) + //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3 + //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3; + + //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2) + //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49 + //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249; + + //........ ........ ......11 11111111 //0x000003FF + x &= 0x000003FF; //Clear all bits above bit 10 + + x = (x ^ (x << 16)) & 0xFF0000FF; + x = (x ^ (x << 8)) & 0x0300F00F; + x = (x ^ (x << 4)) & 0x030C30C3; + x = (x ^ (x << 2)) & 0x09249249; + + return x; +} +unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) +{ + return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; +} + +__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate) +{ + int separatedAabbIndex = get_global_id(0); + if(separatedAabbIndex >= numAabbsToSeparate) return; + + int unseparatedAabbIndex = aabbIndices[separatedAabbIndex]; + out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex]; +} + +//Should replace with an optimized parallel reduction +__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge) +{ + //Each time this kernel is added to the command queue, + //the number of AABBs needing to be merged is halved + // + //Example with 159 AABBs: + // numRemainingAabbs == 159 / 2 + 159 % 2 == 80 + // numMergedAabbs == 159 - 80 == 79 + //So, indices [0, 78] are merged with [0 + 80, 78 + 80] + + int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2; + int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs; + + int aabbIndex = get_global_id(0); + if(aabbIndex >= numMergedAabbs) return; + + int otherAabbIndex = aabbIndex + numRemainingAabbs; + + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; +} + +__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, + __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs) +{ + int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index + if(leafNodeIndex >= numAabbs) return; + + b3AabbCL mergedAabb = mergedAabbOfAllNodes[0]; + b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f; + b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024; + + b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f; + b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter; + + //Quantize into integer coordinates + //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size + b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize; + + int4 discretePosition; + discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) ); + discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) ); + discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) ); + + //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023] + discretePosition = b3Max( -512, b3Min(discretePosition, 511) ); + discretePosition += 512; + + //Interleave bits(assign a morton code, also known as a z-curve) + unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z); + + // + SortDataCL mortonCodeIndexPair; + mortonCodeIndexPair.m_key = mortonCode; + mortonCodeIndexPair.m_value = leafNodeIndex; + + out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair; +} + +#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128 + +//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes. +//If it is set, then the index is for an internal node; otherwise, it is a leaf node. +//In both cases, the bit should be cleared to access the actual node index. +int isLeafNode(int index) { return (index >> 31 == 0); } +int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); } +int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); } + +//From sap.cl +#define NEW_PAIR_MARKER -1 + +bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +//From sap.cl + +__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + + __global SortDataCL* mortonCodesAndAabbIndices, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numQueryAabbs) +{ + //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since + //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent) + int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(queryBvhNodeIndex >= numQueryAabbs) return; + + int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value; + b3AabbCL queryAabb = rigidAabbs[queryRigidIndex]; + + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //Optimization - if the BVH is structured as a binary radix tree, then + //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]). + //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself. + { + int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y; + if(highestLeafIndex <= queryBvhNodeIndex) continue; + } + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) ) + { + if(isLeaf) + { + int4 pair; + pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3]; + pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + + } +} + + +//From rayCastKernels.cl +typedef struct +{ + float4 m_from; + float4 m_to; +} b3RayInfo; +//From rayCastKernels.cl + +b3Vector3 b3Vector3_normalize(b3Vector3 v) +{ + b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f}; + return normalize(normal); //OpenCL normalize == vector4 normalize +} +b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; } +b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; } + +int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb) +{ + //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ). + //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane. + // + //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane + //and min.x will be the far plane; otherwise, it is reversed. + // + //In order for there to be a collision, the t_min and t_max of each pair must overlap. + //This can be tested for by selecting the highest t_min and lowest t_max and comparing them. + + int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y) + + //When using vector types, the select() function checks the most signficant bit, + //but isless() sets the least significant bit. + isNegative <<= 31; + + //select(b, a, condition) == condition ? a : b + //When using select() with vector types, (condition[i]) is true if its most significant bit is 1 + b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection; + b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection; + + b3Scalar t_min_final = 0.0f; + b3Scalar t_max_final = rayLength; + + //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. + //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4]) + //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN. + t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) ); + t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) ); + + return (t_min_final <= t_max_final); +} + +__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + __global SortDataCL* mortonCodesAndAabbIndices, + + __global b3RayInfo* rays, + + __global int* out_numRayRigidPairs, + __global int2* out_rayRigidPairs, + int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + // + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + // + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) ) + { + if(isLeaf) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + } +} + +__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids) +{ + int smallAabbIndex = get_global_id(0); + if(smallAabbIndex >= numSmallAabbRigids) return; + + b3AabbCL smallAabb = smallAabbs[smallAabbIndex]; + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL largeAabb = largeAabbs[i]; + if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) ) + { + int4 pair; + pair.x = largeAabb.m_minIndices[3]; + pair.y = smallAabb.m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + } +} +__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays, + __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs, + int numLargeAabbRigids, int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL rigidAabb = largeRigidAabbs[i]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) ) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabb.m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + } +} + + +//Set so that it is always greater than the actual common prefixes, and never selected as a parent node. +//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve. +//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node. +//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve). +#define B3_PLBVH_INVALID_COMMON_PREFIX 128 + +#define B3_PLBVH_ROOT_NODE_MARKER -1 + +#define b3Int64 long + +int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); } +b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) +{ + //This function only needs to return (i & j) in order for the algorithm to work, + //but it may help with debugging to mask out the lower bits. + + b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j); + + b3Int64 sharedBits = i & j; + b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0 + + return sharedBits & bitmask; +} + +//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths +int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB) +{ + return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) ); +} + +__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices, + __global b3Int64* out_commonPrefixes, + __global int* out_commonPrefixLengths, + int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if (internalNodeIndex >= numInternalNodes) return; + + //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index, + //and the number of internal nodes is always numLeafNodes - 1 + int leftLeafIndex = internalNodeIndex; + int rightLeafIndex = internalNodeIndex + 1; + + int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key; + int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key; + + //Binary radix tree construction algorithm does not work if there are duplicate morton codes. + //Append the index of each leaf node to each morton code so that there are no duplicates. + //The algorithm also requires that the morton codes are sorted in ascending order; this requirement + //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true. + // + //upsample(a, b) == ( ((b3Int64)a) << 32) | b + b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex); + b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex); + + out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); + out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); +} + + +__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes, + __global int2* out_childNodes, int numLeafNodes) +{ + int leafNodeIndex = get_global_id(0); + if (leafNodeIndex >= numLeafNodes) return; + + int numInternalNodes = numLeafNodes - 1; + + int leftSplitIndex = leafNodeIndex - 1; + int rightSplitIndex = leafNodeIndex; + + int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + //Parent node is the highest adjacent common prefix that is lower than the node's common prefix + //Leaf nodes are considered as having the highest common prefix + int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix); + + //Handle cases for the edge nodes; the first and last node + //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX + if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false; + if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true; + + int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex; + out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex; + + int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + int isLeaf = 1; + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex); +} + +__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths, + __global int2* out_childNodes, + __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex, + int numInternalNodes) +{ + int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + b3Int64 nodePrefix = commonPrefixes[internalNodeIndex]; + int nodePrefixLength = commonPrefixLengths[internalNodeIndex]; + +//#define USE_LINEAR_SEARCH +#ifdef USE_LINEAR_SEARCH + int leftIndex = -1; + int rightIndex = -1; + + //Find nearest element to left with a lower common prefix + for(int i = internalNodeIndex - 1; i >= 0; --i) + { + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + leftIndex = i; + break; + } + } + + //Find nearest element to right with a lower common prefix + for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i) + { + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + rightIndex = i; + break; + } + } + +#else //Use binary search + + //Find nearest element to left with a lower common prefix + int leftIndex = -1; + { + int lower = 0; + int upper = internalNodeIndex - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int right = mid + 1; + if(right < internalNodeIndex) + { + b3Int64 rightPrefix = commonPrefixes[right]; + int rightPrefixLength = commonPrefixLengths[right]; + + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + lower = right; + leftIndex = right; + } + else + { + leftIndex = mid; + break; + } + } + else + { + leftIndex = mid; + break; + } + } + else upper = mid - 1; + } + } + + //Find nearest element to right with a lower common prefix + int rightIndex = -1; + { + int lower = internalNodeIndex + 1; + int upper = numInternalNodes - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int left = mid - 1; + if(left > internalNodeIndex) + { + b3Int64 leftPrefix = commonPrefixes[left]; + int leftPrefixLength = commonPrefixLengths[left]; + + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + upper = left; + rightIndex = left; + } + else + { + rightIndex = mid; + break; + } + } + else + { + rightIndex = mid; + break; + } + } + else lower = mid + 1; + } + } +#endif + + //Select parent + { + int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength); + + if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false; + else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true; + + int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex; + + int isRootNode = (leftIndex == -1 && rightIndex == -1); + out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER; + + int isLeaf = 0; + if(!isRootNode) + { + int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } + else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } +} + +__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes, + __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes) +{ + if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0); + + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + // + int distanceFromRoot = 0; + { + int parentIndex = internalNodeParentNodes[internalNodeIndex]; + while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER) + { + parentIndex = internalNodeParentNodes[parentIndex]; + ++distanceFromRoot; + } + } + out_distanceFromRoot[internalNodeIndex] = distanceFromRoot; + + // + __local int localMaxDistanceFromRoot; + if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + atomic_max(&localMaxDistanceFromRoot, distanceFromRoot); + barrier(CLK_LOCAL_MEM_FENCE); + + if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot); +} + +__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices, + __global int2* childNodes, + __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs, + int maxDistanceFromRoot, int processedDistance, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int distance = distanceFromRoot[internalNodeIndex]; + + if(distance == processedDistance) + { + int leftChildIndex = childNodes[internalNodeIndex].x; + int rightChildIndex = childNodes[internalNodeIndex].y; + + int isLeftChildLeaf = isLeafNode(leftChildIndex); + int isRightChildLeaf = isLeafNode(rightChildIndex); + + leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex); + rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex); + + //leftRigidIndex/rightRigidIndex is not used if internal node + int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; + int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; + + b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex]; + b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); + mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); + internalNodeAabbs[internalNodeIndex] = mergedAabb; + } +} + +__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int numLeafNodes = numInternalNodes + 1; + + int2 childNodes = internalNodeChildNodes[internalNodeIndex]; + + int2 leafIndexRange; //x == min leaf index, y == max leaf index + + //Find lowest leaf index covered by this internal node + { + int lowestIndex = childNodes.x; //childNodes.x == Left child + while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x; + leafIndexRange.x = lowestIndex; + } + + //Find highest leaf index covered by this internal node + { + int highestIndex = childNodes.y; //childNodes.y == Right child + while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y; + leafIndexRange.y = highestIndex; + } + + // + out_leafIndexRanges[internalNodeIndex] = leafIndexRange; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h new file mode 100644 index 0000000000..5eb8f45b16 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -0,0 +1,729 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* parallelLinearBvhCL= \ +"/*\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Initial Author Jackson Lee, 2014\n" +"typedef float b3Scalar;\n" +"typedef float4 b3Vector3;\n" +"#define b3Max max\n" +"#define b3Min min\n" +"#define b3Sqrt sqrt\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} b3AabbCL;\n" +"unsigned int interleaveBits(unsigned int x)\n" +"{\n" +" //........ ........ ......12 3456789A //x\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" +" \n" +" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" +" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" +" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" \n" +" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" +" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" +" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" +" \n" +" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" +" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" +" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" +" \n" +" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" +" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" //........ ........ ......11 11111111 //0x000003FF\n" +" x &= 0x000003FF; //Clear all bits above bit 10\n" +" \n" +" x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" x = (x ^ (x << 8)) & 0x0300F00F;\n" +" x = (x ^ (x << 4)) & 0x030C30C3;\n" +" x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" return x;\n" +"}\n" +"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" +"{\n" +" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" +"}\n" +"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" +"{\n" +" int separatedAabbIndex = get_global_id(0);\n" +" if(separatedAabbIndex >= numAabbsToSeparate) return;\n" +" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" +" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" +"}\n" +"//Should replace with an optimized parallel reduction\n" +"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" +"{\n" +" //Each time this kernel is added to the command queue, \n" +" //the number of AABBs needing to be merged is halved\n" +" //\n" +" //Example with 159 AABBs:\n" +" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" +" // numMergedAabbs == 159 - 80 == 79\n" +" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" +" \n" +" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" +" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" +" \n" +" int aabbIndex = get_global_id(0);\n" +" if(aabbIndex >= numMergedAabbs) return;\n" +" \n" +" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" +"}\n" +"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" +" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" +"{\n" +" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" +" if(leafNodeIndex >= numAabbs) return;\n" +" \n" +" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" +" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" +" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" +" \n" +" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" +" \n" +" //Quantize into integer coordinates\n" +" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" +" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" +" \n" +" int4 discretePosition;\n" +" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" +" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" +" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" +" \n" +" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" +" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" +" discretePosition += 512;\n" +" \n" +" //Interleave bits(assign a morton code, also known as a z-curve)\n" +" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" +" \n" +" //\n" +" SortDataCL mortonCodeIndexPair;\n" +" mortonCodeIndexPair.m_key = mortonCode;\n" +" mortonCodeIndexPair.m_value = leafNodeIndex;\n" +" \n" +" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" +"}\n" +"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" +"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" +"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" +"//In both cases, the bit should be cleared to access the actual node index.\n" +"int isLeafNode(int index) { return (index >> 31 == 0); }\n" +"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" +"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" +"//From sap.cl\n" +"#define NEW_PAIR_MARKER -1\n" +"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"//From sap.cl\n" +"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" \n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numQueryAabbs)\n" +"{\n" +" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" +" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" +" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(queryBvhNodeIndex >= numQueryAabbs) return;\n" +" \n" +" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" +" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" +" \n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //Optimization - if the BVH is structured as a binary radix tree, then\n" +" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" +" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" +" {\n" +" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" +" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" +" }\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int4 pair;\n" +" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" +" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +"//From rayCastKernels.cl\n" +"typedef struct\n" +"{\n" +" float4 m_from;\n" +" float4 m_to;\n" +"} b3RayInfo;\n" +"//From rayCastKernels.cl\n" +"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" +"{\n" +" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" +" return normalize(normal); //OpenCL normalize == vector4 normalize\n" +"}\n" +"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" +"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" +"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" +"{\n" +" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" +" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" +" //\n" +" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" +" //and min.x will be the far plane; otherwise, it is reversed.\n" +" //\n" +" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" +" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" +" \n" +" int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" +" \n" +" //When using vector types, the select() function checks the most signficant bit, \n" +" //but isless() sets the least significant bit.\n" +" isNegative <<= 31;\n" +" //select(b, a, condition) == condition ? a : b\n" +" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" +" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" \n" +" b3Scalar t_min_final = 0.0f;\n" +" b3Scalar t_max_final = rayLength;\n" +" \n" +" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" +" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" +" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" +" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" +" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" +" \n" +" return (t_min_final <= t_max_final);\n" +"}\n" +"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" \n" +" __global b3RayInfo* rays,\n" +" \n" +" __global int* out_numRayRigidPairs, \n" +" __global int2* out_rayRigidPairs,\n" +" int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" //\n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" //\n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" +"{\n" +" int smallAabbIndex = get_global_id(0);\n" +" if(smallAabbIndex >= numSmallAabbRigids) return;\n" +" \n" +" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL largeAabb = largeAabbs[i];\n" +" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" +" {\n" +" int4 pair;\n" +" pair.x = largeAabb.m_minIndices[3];\n" +" pair.y = smallAabb.m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" +" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" +" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL rigidAabb = largeRigidAabbs[i];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabb.m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" }\n" +"}\n" +"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" +"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" +"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" +"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" +"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" +"#define b3Int64 long\n" +"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" +"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" +"{\n" +" //This function only needs to return (i & j) in order for the algorithm to work,\n" +" //but it may help with debugging to mask out the lower bits.\n" +" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" +" b3Int64 sharedBits = i & j;\n" +" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" +" \n" +" return sharedBits & bitmask;\n" +"}\n" +"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" +"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" +"{\n" +" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" +"}\n" +"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global b3Int64* out_commonPrefixes,\n" +" __global int* out_commonPrefixLengths,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if (internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" +" //and the number of internal nodes is always numLeafNodes - 1\n" +" int leftLeafIndex = internalNodeIndex;\n" +" int rightLeafIndex = internalNodeIndex + 1;\n" +" \n" +" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" +" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" +" \n" +" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" +" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" +" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" +" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" +" //\n" +" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" +" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" +" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" +" \n" +" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +"}\n" +"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" +" __global int2* out_childNodes, int numLeafNodes)\n" +"{\n" +" int leafNodeIndex = get_global_id(0);\n" +" if (leafNodeIndex >= numLeafNodes) return;\n" +" \n" +" int numInternalNodes = numLeafNodes - 1;\n" +" \n" +" int leftSplitIndex = leafNodeIndex - 1;\n" +" int rightSplitIndex = leafNodeIndex;\n" +" \n" +" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" +" //Leaf nodes are considered as having the highest common prefix\n" +" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" +" \n" +" //Handle cases for the edge nodes; the first and last node\n" +" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" +" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" +" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" +" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" +" \n" +" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" int isLeaf = 1;\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" +"}\n" +"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" +" __global int2* out_childNodes,\n" +" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" +" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" +" \n" +"//#define USE_LINEAR_SEARCH\n" +"#ifdef USE_LINEAR_SEARCH\n" +" int leftIndex = -1;\n" +" int rightIndex = -1;\n" +" \n" +" //Find nearest element to left with a lower common prefix\n" +" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" +" {\n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" leftIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" +" {\n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" rightIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +"#else //Use binary search\n" +" //Find nearest element to left with a lower common prefix\n" +" int leftIndex = -1;\n" +" {\n" +" int lower = 0;\n" +" int upper = internalNodeIndex - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int right = mid + 1;\n" +" if(right < internalNodeIndex)\n" +" {\n" +" b3Int64 rightPrefix = commonPrefixes[right];\n" +" int rightPrefixLength = commonPrefixLengths[right];\n" +" \n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" lower = right;\n" +" leftIndex = right;\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else upper = mid - 1;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" int rightIndex = -1;\n" +" {\n" +" int lower = internalNodeIndex + 1;\n" +" int upper = numInternalNodes - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int left = mid - 1;\n" +" if(left > internalNodeIndex)\n" +" {\n" +" b3Int64 leftPrefix = commonPrefixes[left];\n" +" int leftPrefixLength = commonPrefixLengths[left];\n" +" \n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" upper = left;\n" +" rightIndex = left;\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else lower = mid + 1;\n" +" }\n" +" }\n" +"#endif\n" +" \n" +" //Select parent\n" +" {\n" +" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" +" \n" +" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" +" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" +" \n" +" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" +" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" +" \n" +" int isLeaf = 0;\n" +" if(!isRootNode)\n" +" {\n" +" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +"}\n" +"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" +" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" +"{\n" +" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //\n" +" int distanceFromRoot = 0;\n" +" {\n" +" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" +" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" +" {\n" +" parentIndex = internalNodeParentNodes[parentIndex];\n" +" ++distanceFromRoot;\n" +" }\n" +" }\n" +" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" +" \n" +" //\n" +" __local int localMaxDistanceFromRoot;\n" +" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" +"}\n" +"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int2* childNodes,\n" +" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" +" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int distance = distanceFromRoot[internalNodeIndex];\n" +" \n" +" if(distance == processedDistance)\n" +" {\n" +" int leftChildIndex = childNodes[internalNodeIndex].x;\n" +" int rightChildIndex = childNodes[internalNodeIndex].y;\n" +" \n" +" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" +" int isRightChildLeaf = isLeafNode(rightChildIndex);\n" +" \n" +" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" +" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" +" \n" +" //leftRigidIndex/rightRigidIndex is not used if internal node\n" +" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" +" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" +" \n" +" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" +" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" +" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" +" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" +" }\n" +"}\n" +"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int numLeafNodes = numInternalNodes + 1;\n" +" \n" +" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" +" \n" +" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" +" \n" +" //Find lowest leaf index covered by this internal node\n" +" {\n" +" int lowestIndex = childNodes.x; //childNodes.x == Left child\n" +" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" +" leafIndexRange.x = lowestIndex;\n" +" }\n" +" \n" +" //Find highest leaf index covered by this internal node\n" +" {\n" +" int highestIndex = childNodes.y; //childNodes.y == Right child\n" +" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" +" leafIndexRange.y = highestIndex;\n" +" }\n" +" \n" +" //\n" +" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl new file mode 100644 index 0000000000..93f77a6433 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl @@ -0,0 +1,389 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#define NEW_PAIR_MARKER -1 + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + + +/// conservative test for overlap between two aabbs +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2); +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + + +__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numUnsortedAabbs) + return; + + int j = get_global_id(1); + if (j>=numUnSortedAabbs2) + return; + + + __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]]; + __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]]; + + if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2)) + { + int4 myPair; + + int xIndex = unsortedAabbPtr[0].m_minIndices[3]; + int yIndex = unsortedAabbPtr2[0].m_minIndices[3]; + if (xIndex>yIndex) + { + int tmp = xIndex; + xIndex=yIndex; + yIndex=tmp; + } + + myPair.x = xIndex; + myPair.y = yIndex; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } +} + + + +__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + for (int j=i+1;j<numObjects;j++) + { + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } +} + +__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + for (int j=i+1;j<numObjects;j++) + { + if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) + { + break; + } + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } +} + + + + +__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + int localId = get_local_id(0); + + __local int numActiveWgItems[1]; + __local int breakRequest[1]; + + if (localId==0) + { + numActiveWgItems[0] = 0; + breakRequest[0] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + atomic_inc(numActiveWgItems); + barrier(CLK_LOCAL_MEM_FENCE); + int localBreak = 0; + + int j=i+1; + do + { + barrier(CLK_LOCAL_MEM_FENCE); + + if (j<numObjects) + { + if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) + { + if (!localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (j>=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int4 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } + j++; + + } while (breakRequest[0]<numActiveWgItems[0]); +} + + +__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + int localId = get_local_id(0); + + __local int numActiveWgItems[1]; + __local int breakRequest[1]; + __local btAabbCL localAabbs[128];// = aabbs[i]; + + btAabbCL myAabb; + + myAabb = (i<numObjects)? aabbs[i]:aabbs[0]; + float testValue = myAabb.m_maxElems[axis]; + + if (localId==0) + { + numActiveWgItems[0] = 0; + breakRequest[0] = 0; + } + int localCount=0; + int block=0; + localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0]; + localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + atomic_inc(numActiveWgItems); + barrier(CLK_LOCAL_MEM_FENCE); + int localBreak = 0; + + int j=i+1; + do + { + barrier(CLK_LOCAL_MEM_FENCE); + + if (j<numObjects) + { + if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) + { + if (!localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (j>=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1])) + { + int4 myPair; + myPair.x = myAabb.m_minIndices[3]; + myPair.y = localAabbs[localCount+localId+1].m_minIndices[3]; + myPair.z = NEW_PAIR_MARKER; + myPair.w = NEW_PAIR_MARKER; + + int curPair = atomic_inc (pairCount); + if (curPair<maxPairs) + { + pairsOut[curPair] = myPair; //flush to main memory + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + localCount++; + if (localCount==64) + { + localCount = 0; + block+=64; + localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0]; + localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0]; + } + j++; + + } while (breakRequest[0]<numActiveWgItems[0]); + +} + + + + +//http://stereopsis.com/radix.html +unsigned int FloatFlip(float fl); +unsigned int FloatFlip(float fl) +{ + unsigned int f = *(unsigned int*)&fl; + unsigned int mask = -(int)(f >> 31) | 0x80000000; + return f ^ mask; +} +float IFloatFlip(unsigned int f); +float IFloatFlip(unsigned int f) +{ + unsigned int mask = ((f >> 31) - 1) | 0x80000000; + unsigned int fl = f ^ mask; + return *(float*)&fl; +} + + + + +__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + int src = destAabbs[i].m_maxIndices[3]; + destAabbs[i] = allAabbs[src]; + destAabbs[i].m_maxIndices[3] = src; +} + + +__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + + sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]); + sortData[i].y = i; + +} + + +__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]]; +} + + + +__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs) +{ + int i = get_global_id(0); + if (i>=numAabbs) + return; + + btAabbCL smallAabb = allAabbs[smallAabbMapping[i]]; + + float4 s; + s = (smallAabb.m_max+smallAabb.m_min)*0.5f; + sum[i]=s; + sum2[i]=s*s; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h new file mode 100644 index 0000000000..04d40fcf26 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -0,0 +1,342 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* sapCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#define NEW_PAIR_MARKER -1\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"/// conservative test for overlap between two aabbs\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numUnsortedAabbs)\n" +" return;\n" +" int j = get_global_id(1);\n" +" if (j>=numUnSortedAabbs2)\n" +" return;\n" +" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" +" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" +" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" +" {\n" +" int4 myPair;\n" +" \n" +" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" +" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" +" if (xIndex>yIndex)\n" +" {\n" +" int tmp = xIndex;\n" +" xIndex=yIndex;\n" +" yIndex=tmp;\n" +" }\n" +" \n" +" myPair.x = xIndex;\n" +" myPair.y = yIndex;\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" for (int j=i+1;j<numObjects;j++)\n" +" {\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" for (int j=i+1;j<numObjects;j++)\n" +" {\n" +" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +" {\n" +" break;\n" +" }\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int localId = get_local_id(0);\n" +" __local int numActiveWgItems[1];\n" +" __local int breakRequest[1];\n" +" if (localId==0)\n" +" {\n" +" numActiveWgItems[0] = 0;\n" +" breakRequest[0] = 0;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" atomic_inc(numActiveWgItems);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" int localBreak = 0;\n" +" int j=i+1;\n" +" do\n" +" {\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j<numObjects)\n" +" {\n" +" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +" {\n" +" if (!localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j>=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +" j++;\n" +" } while (breakRequest[0]<numActiveWgItems[0]);\n" +"}\n" +"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int localId = get_local_id(0);\n" +" __local int numActiveWgItems[1];\n" +" __local int breakRequest[1];\n" +" __local btAabbCL localAabbs[128];// = aabbs[i];\n" +" \n" +" btAabbCL myAabb;\n" +" \n" +" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" +" float testValue = myAabb.m_maxElems[axis];\n" +" \n" +" if (localId==0)\n" +" {\n" +" numActiveWgItems[0] = 0;\n" +" breakRequest[0] = 0;\n" +" }\n" +" int localCount=0;\n" +" int block=0;\n" +" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" +" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" atomic_inc(numActiveWgItems);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" int localBreak = 0;\n" +" \n" +" int j=i+1;\n" +" do\n" +" {\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j<numObjects)\n" +" {\n" +" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" +" {\n" +" if (!localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (j>=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" +" {\n" +" int4 myPair;\n" +" myPair.x = myAabb.m_minIndices[3];\n" +" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" +" myPair.z = NEW_PAIR_MARKER;\n" +" myPair.w = NEW_PAIR_MARKER;\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair<maxPairs)\n" +" {\n" +" pairsOut[curPair] = myPair; //flush to main memory\n" +" }\n" +" }\n" +" }\n" +" \n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" localCount++;\n" +" if (localCount==64)\n" +" {\n" +" localCount = 0;\n" +" block+=64; \n" +" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" +" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" +" }\n" +" j++;\n" +" \n" +" } while (breakRequest[0]<numActiveWgItems[0]);\n" +" \n" +"}\n" +"//http://stereopsis.com/radix.html\n" +"unsigned int FloatFlip(float fl);\n" +"unsigned int FloatFlip(float fl)\n" +"{\n" +" unsigned int f = *(unsigned int*)&fl;\n" +" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" +" return f ^ mask;\n" +"}\n" +"float IFloatFlip(unsigned int f);\n" +"float IFloatFlip(unsigned int f)\n" +"{\n" +" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" +" unsigned int fl = f ^ mask;\n" +" return *(float*)&fl;\n" +"}\n" +"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" int src = destAabbs[i].m_maxIndices[3];\n" +" destAabbs[i] = allAabbs[src];\n" +" destAabbs[i].m_maxIndices[3] = src;\n" +"}\n" +"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" \n" +" \n" +" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" +" sortData[i].y = i;\n" +" \n" +"}\n" +"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" \n" +" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" +"}\n" +"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numAabbs)\n" +" return;\n" +" \n" +" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" +" \n" +" float4 s;\n" +" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" +" sum[i]=s;\n" +" sum2[i]=s*s; \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/CMakeLists.txt b/thirdparty/bullet/src/Bullet3OpenCL/CMakeLists.txt new file mode 100644 index 0000000000..1da58d4a99 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/CMakeLists.txt @@ -0,0 +1,77 @@ +INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src ) + +ADD_DEFINITIONS(-DB3_USE_CLEW) + +SET(Bullet3OpenCL_clew_SRCS + ../clew/clew.c + BroadphaseCollision/b3GpuGridBroadphase.cpp + BroadphaseCollision/b3GpuSapBroadphase.cpp + BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp + BroadphaseCollision/b3GpuParallelLinearBvh.cpp + Initialize/b3OpenCLUtils.cpp + NarrowphaseCollision/b3ContactCache.cpp + NarrowphaseCollision/b3ConvexHullContact.cpp + NarrowphaseCollision/b3GjkEpa.cpp + NarrowphaseCollision/b3OptimizedBvh.cpp + NarrowphaseCollision/b3QuantizedBvh.cpp + NarrowphaseCollision/b3StridingMeshInterface.cpp + NarrowphaseCollision/b3TriangleCallback.cpp + NarrowphaseCollision/b3TriangleIndexVertexArray.cpp + NarrowphaseCollision/b3VoronoiSimplexSolver.cpp + ParallelPrimitives/b3BoundSearchCL.cpp + ParallelPrimitives/b3FillCL.cpp + ParallelPrimitives/b3LauncherCL.cpp + ParallelPrimitives/b3PrefixScanCL.cpp + ParallelPrimitives/b3PrefixScanFloat4CL.cpp + ParallelPrimitives/b3RadixSort32CL.cpp + Raycast/b3GpuRaycast.cpp + RigidBody/b3GpuGenericConstraint.cpp + RigidBody/b3GpuJacobiContactSolver.cpp + RigidBody/b3GpuNarrowPhase.cpp + RigidBody/b3GpuPgsConstraintSolver.cpp + RigidBody/b3GpuPgsContactSolver.cpp + RigidBody/b3GpuRigidBodyPipeline.cpp + RigidBody/b3Solver.cpp +) + + +SET(Bullet3OpenCL_clew_HDRS +# ${Root_HDRS} +) + + +ADD_LIBRARY(Bullet3OpenCL_clew ${Bullet3OpenCL_clew_SRCS} ${Bullet3OpenCL_clew_HDRS}) +SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES VERSION ${BULLET_VERSION}) +SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES SOVERSION ${BULLET_VERSION}) +IF (BUILD_SHARED_LIBS) + TARGET_LINK_LIBRARIES(Bullet3OpenCL_clew LinearMath Bullet3Dynamics ${CMAKE_DL_LIBS}) +ENDIF (BUILD_SHARED_LIBS) + + +IF (INSTALL_LIBS) + IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) + #INSTALL of other files requires CMake 2.6 + IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5) + IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK) + INSTALL(TARGETS Bullet3OpenCL_clew DESTINATION .) + ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK) + INSTALL(TARGETS Bullet3OpenCL_clew RUNTIME DESTINATION bin + LIBRARY DESTINATION lib${LIB_SUFFIX} + ARCHIVE DESTINATION lib${LIB_SUFFIX}) + INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h" PATTERN ".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE) +# INSTALL(FILES ../btBullet3OpenCL_clewCommon.h +#DESTINATION ${INCLUDE_INSTALL_DIR}/Bullet3OpenCL_clew) + ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK) + ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5) + + IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK) + SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES FRAMEWORK true) + + SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES PUBLIC_HEADER "${Root_HDRS}") + # Have to list out sub-directories manually: + SET_PROPERTY(SOURCE ${BroadphaseCollision_HDRS} PROPERTY MACOSX_PACKAGE_LOCATION Headers/BroadphaseCollision) + + ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK) + ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) +ENDIF (INSTALL_LIBS) diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h new file mode 100644 index 0000000000..e79182d7cb --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h @@ -0,0 +1,48 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_OPENCL_INCLUDE_H +#define B3_OPENCL_INCLUDE_H + +#ifdef B3_USE_CLEW + #include "clew/clew.h" +#else + +#ifdef __APPLE__ +#ifdef USE_MINICL +#include <MiniCL/cl.h> +#else +#include <OpenCL/cl.h> +#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE +#endif +#else +#ifdef USE_MINICL +#include <MiniCL/cl.h> +#else +#include <CL/cl.h> +#ifdef _WIN32 +#include "CL/cl_gl.h" +#endif //_WIN32 +#endif +#endif //__APPLE__ +#endif //B3_USE_CLEW + +#include <assert.h> +#include <stdio.h> +#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); } + + +#endif //B3_OPENCL_INCLUDE_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp new file mode 100644 index 0000000000..dd194fc7ba --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp @@ -0,0 +1,1011 @@ +/* +Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org +Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +//Original author: Roman Ponomarev +//Mostly Reimplemented by Erwin Coumans + + +bool gDebugForceLoadingFromSource = false; +bool gDebugSkipLoadingBinary = false; + +#include "Bullet3Common/b3Logging.h" + +#include <string.h> + +#ifdef _WIN32 +#pragma warning (disable:4996) +#endif +#include "b3OpenCLUtils.h" +//#include "b3OpenCLInclude.h" + +#include <stdio.h> +#include <stdlib.h> + +#define B3_MAX_CL_DEVICES 16 //who needs 16 devices? + +#ifdef _WIN32 +#include <windows.h> +#endif + +#include <assert.h> +#define b3Assert assert +#ifndef _WIN32 +#include <sys/stat.h> + +#endif + +static const char* sCachedBinaryPath="cache"; + + +//Set the preferred platform vendor using the OpenCL SDK +static const char* spPlatformVendor = +#if defined(CL_PLATFORM_MINI_CL) +"MiniCL, SCEA"; +#elif defined(CL_PLATFORM_AMD) +"Advanced Micro Devices, Inc."; +#elif defined(CL_PLATFORM_NVIDIA) +"NVIDIA Corporation"; +#elif defined(CL_PLATFORM_INTEL) +"Intel(R) Corporation"; +#elif defined(B3_USE_CLEW) +"clew (OpenCL Extension Wrangler library)"; +#else +"Unknown Vendor"; +#endif + +#ifndef CL_PLATFORM_MINI_CL +#ifdef _WIN32 +#ifndef B3_USE_CLEW +#include "CL/cl_gl.h" +#endif //B3_USE_CLEW +#endif //_WIN32 +#endif + + +void MyFatalBreakAPPLE( const char * errstr , + const void * private_info , + size_t cb , + void * user_data ) +{ + + + const char* patloc = strstr(errstr, "Warning"); + //find out if it is a warning or error, exit if error + + if (patloc) + { + b3Warning("Warning: %s\n", errstr); + } else + { + b3Error("Error: %s\n", errstr); + b3Assert(0); + } + +} + +#ifdef B3_USE_CLEW + +int b3OpenCLUtils_clewInit() +{ + int result = -1; + +#ifdef _WIN32 + const char* cl = "OpenCL.dll"; +#elif defined __APPLE__ + const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; +#else//presumable Linux? + //linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers, not that there is no symbolic link from libOpenCL.so + const char* cl = "libOpenCL.so.1"; + result = clewInit(cl); + if (result != CLEW_SUCCESS) + { + cl = "libOpenCL.so"; + } else + { + clewExit(); + } +#endif + result = clewInit(cl); + if (result!=CLEW_SUCCESS) + { + b3Error("clewInit failed with error code %d\n",result); + } + else + { + b3Printf("clewInit succesfull using %s\n",cl); + } + return result; +} +#endif + +int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum) +{ +#ifdef B3_USE_CLEW + b3OpenCLUtils_clewInit(); +#endif + + cl_platform_id pPlatforms[10] = { 0 }; + + cl_uint numPlatforms = 0; + cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms); + //cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); + + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) + *pErrNum = ciErrNum; + } + return numPlatforms; + +} + +const char* b3OpenCLUtils_getSdkVendorName() +{ + return spPlatformVendor; +} + +void b3OpenCLUtils_setCachePath(const char* path) +{ + sCachedBinaryPath = path; +} + +cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum) +{ +#ifdef B3_USE_CLEW + b3OpenCLUtils_clewInit(); +#endif + + cl_platform_id platform = 0; + unsigned int platformIndex = (unsigned int )platformIndex0; + cl_uint numPlatforms; + cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); + + if (platformIndex<numPlatforms) + { + cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms); + ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) + *pErrNum = ciErrNum; + return platform; + } + + platform = platforms[platformIndex]; + + free (platforms); + } + + return platform; +} + +void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo) +{ + b3Assert(platform); + cl_int ciErrNum; + ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_VENDOR,B3_MAX_STRING_LENGTH,platformInfo->m_platformVendor,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); + ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_NAME,B3_MAX_STRING_LENGTH,platformInfo->m_platformName,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); + ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_VERSION,B3_MAX_STRING_LENGTH,platformInfo->m_platformVersion,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); +} + +void b3OpenCLUtils_printPlatformInfo( cl_platform_id platform) +{ + b3OpenCLPlatformInfo platformInfo; + b3OpenCLUtils::getPlatformInfo (platform, &platformInfo); + b3Printf("Platform info:\n"); + b3Printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor); + b3Printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName); + b3Printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion); +} + + + +cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex) +{ + cl_context retContext = 0; + cl_int ciErrNum=0; + cl_uint num_entries; + cl_device_id devices[B3_MAX_CL_DEVICES]; + cl_uint num_devices; + cl_context_properties* cprops; + + /* + * If we could find our platform, use it. Otherwise pass a NULL and get whatever the + * implementation thinks we should be using. + */ + cl_context_properties cps[7] = {0,0,0,0,0,0,0}; + cps[0] = CL_CONTEXT_PLATFORM; + cps[1] = (cl_context_properties)platform; +#ifdef _WIN32 +#ifndef B3_USE_CLEW + if (pGLContext && pGLDC) + { + cps[2] = CL_GL_CONTEXT_KHR; + cps[3] = (cl_context_properties)pGLContext; + cps[4] = CL_WGL_HDC_KHR; + cps[5] = (cl_context_properties)pGLDC; + } +#endif //B3_USE_CLEW +#endif //_WIN32 + num_entries = B3_MAX_CL_DEVICES; + + + num_devices=-1; + + ciErrNum = clGetDeviceIDs( + platform, + deviceType, + num_entries, + devices, + &num_devices); + + if (ciErrNum<0) + { + b3Printf("clGetDeviceIDs returned %d\n",ciErrNum); + return 0; + } + cprops = (NULL == platform) ? NULL : cps; + + if (!num_devices) + return 0; + + if (pGLContext) + { + //search for the GPU that relates to the OpenCL context + unsigned int i; + for (i=0;i<num_devices;i++) + { + retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum); + if (ciErrNum==CL_SUCCESS) + break; + } + } + else + { + if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices) + { + //create a context of the preferred device index + retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum); + } else + { + //create a context of all devices +#if defined (__APPLE__) + retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum); +#else + b3Printf("numDevices=%d\n",num_devices); + + retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum); +#endif + } + } + if(pErrNum != NULL) + { + *pErrNum = ciErrNum; + }; + + return retContext; +} + +cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId) +{ +#ifdef B3_USE_CLEW + b3OpenCLUtils_clewInit(); +#endif + + + cl_uint numPlatforms; + cl_context retContext = 0; + unsigned int i; + + cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) *pErrNum = ciErrNum; + return NULL; + } + if(numPlatforms > 0) + { + cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms); + ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) + *pErrNum = ciErrNum; + free(platforms); + return NULL; + } + + + + for ( i = 0; i < numPlatforms; ++i) + { + char pbuf[128]; + ciErrNum = clGetPlatformInfo( platforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuf), + pbuf, + NULL); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) *pErrNum = ciErrNum; + return NULL; + } + + if (preferredPlatformIndex>=0 && i==preferredPlatformIndex) + { + cl_platform_id tmpPlatform = platforms[0]; + platforms[0] = platforms[i]; + platforms[i] = tmpPlatform; + break; + } else + { + if(!strcmp(pbuf, spPlatformVendor)) + { + cl_platform_id tmpPlatform = platforms[0]; + platforms[0] = platforms[i]; + platforms[i] = tmpPlatform; + } + } + } + + for (i = 0; i < numPlatforms; ++i) + { + cl_platform_id platform = platforms[i]; + assert(platform); + + retContext = b3OpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex); + + if (retContext) + { +// printf("OpenCL platform details:\n"); + b3OpenCLPlatformInfo platformInfo; + + b3OpenCLUtils::getPlatformInfo(platform, &platformInfo); + + if (retPlatformId) + *retPlatformId = platform; + + break; + } + } + + free (platforms); + } + return retContext; +} + + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the nth device from the context +//! +//! @return the id or -1 when out of range +//! @param cxMainContext OpenCL context +//! @param device_idx index of the device of interest +////////////////////////////////////////////////////////////////////////////// +cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex) +{ + assert(cxMainContext); + + size_t szParmDataBytes; + cl_device_id* cdDevices; + cl_device_id device ; + + // get the list of devices associated with context + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); + + if( szParmDataBytes / sizeof(cl_device_id) < (unsigned int)deviceIndex ) { + return (cl_device_id)-1; + } + + cdDevices = (cl_device_id*) malloc(szParmDataBytes); + + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); + + device = cdDevices[deviceIndex]; + free(cdDevices); + + return device; +} + +int b3OpenCLUtils_getNumDevices(cl_context cxMainContext) +{ + size_t szParamDataBytes; + int device_count; + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes); + device_count = (int) szParamDataBytes/ sizeof(cl_device_id); + return device_count; +} + + + +void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info) +{ + // CL_DEVICE_NAME + clGetDeviceInfo(device, CL_DEVICE_NAME, B3_MAX_STRING_LENGTH, &info->m_deviceName, NULL); + + // CL_DEVICE_VENDOR + clGetDeviceInfo(device, CL_DEVICE_VENDOR, B3_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL); + + // CL_DRIVER_VERSION + clGetDeviceInfo(device, CL_DRIVER_VERSION, B3_MAX_STRING_LENGTH, &info->m_driverVersion, NULL); + + // CL_DEVICE_INFO + clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL); + + // CL_DEVICE_MAX_COMPUTE_UNITS + clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL); + + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL); + + // CL_DEVICE_MAX_WORK_ITEM_SIZES + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL); + + // CL_DEVICE_MAX_WORK_GROUP_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL); + + // CL_DEVICE_MAX_CLOCK_FREQUENCY + clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL); + + // CL_DEVICE_ADDRESS_BITS + clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL); + + // CL_DEVICE_MAX_MEM_ALLOC_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL); + + // CL_DEVICE_GLOBAL_MEM_SIZE + clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL); + + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL); + + // CL_DEVICE_LOCAL_MEM_TYPE + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL); + + // CL_DEVICE_LOCAL_MEM_SIZE + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL); + + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL); + + // CL_DEVICE_QUEUE_PROPERTIES + clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL); + + // CL_DEVICE_IMAGE_SUPPORT + clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL); + + // CL_DEVICE_MAX_READ_IMAGE_ARGS + clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL); + + // CL_DEVICE_MAX_WRITE_IMAGE_ARGS + clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL); + + // CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL); + + // CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, B3_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL); + + // CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type> + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL); +} + + +void b3OpenCLUtils_printDeviceInfo(cl_device_id device) +{ + b3OpenCLDeviceInfo info; + b3OpenCLUtils::getDeviceInfo(device,&info); + b3Printf("Device Info:\n"); + b3Printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName); + b3Printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor); + b3Printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion); + + if( info.m_deviceType & CL_DEVICE_TYPE_CPU ) + b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU"); + if( info.m_deviceType & CL_DEVICE_TYPE_GPU ) + b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU"); + if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR ) + b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); + if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT ) + b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); + + b3Printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits); + b3Printf(" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims); + b3Printf(" CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]); + b3Printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize); + b3Printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency); + b3Printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits); + b3Printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024))); + b3Printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024))); + b3Printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no"); + b3Printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global"); + b3Printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024)); + b3Printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024)); + if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) + b3Printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE"); + if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE ) + b3Printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE"); + + b3Printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport); + + b3Printf(" CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs); + b3Printf(" CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs); + b3Printf("\n CL_DEVICE_IMAGE <dim>"); + b3Printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth); + b3Printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight); + b3Printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth); + b3Printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight); + b3Printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth); + if (*info.m_deviceExtensions != 0) + { + b3Printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions); + } + else + { + b3Printf(" CL_DEVICE_EXTENSIONS: None\n"); + } + b3Printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); + b3Printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", + info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble); + + +} + + +static const char* strip2(const char* name, const char* pattern) +{ + size_t const patlen = strlen(pattern); + size_t patcnt = 0; + const char * oriptr; + const char * patloc; + // find how many times the pattern occurs in the original string + for (oriptr = name; (patloc = strstr(oriptr, pattern)); oriptr = patloc + patlen) + { + patcnt++; + } + return oriptr; +} + +cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching, bool disableBinaryCaching) +{ + const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:""; + + if (disableBinaryCaching) + { + //kernelSourceOrg = 0; + } + + cl_program m_cpProgram=0; + cl_int status; + + char binaryFileName[B3_MAX_STRING_LENGTH]; + + char deviceName[256]; + char driverVersion[256]; + const char* strippedName; + int fileUpToDate = 0; +#ifdef _WIN32 + int binaryFileValid=0; +#endif + if (!disableBinaryCaching && clFileNameForCaching) + { + clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL); + clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL); + + strippedName = strip2(clFileNameForCaching,"\\"); + strippedName = strip2(strippedName,"/"); + +#ifdef _MSVC_VER + sprintf_s(binaryFileName,B3_MAX_STRING_LENGTH,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, deviceName,driverVersion ); +#else + sprintf(binaryFileName,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, deviceName,driverVersion ); +#endif + } + if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) ) + { + +#ifdef _WIN32 + char* bla=0; + + + + //printf("searching for %s\n", binaryFileName); + + + FILETIME modtimeBinary; + CreateDirectoryA(sCachedBinaryPath,0); + { + + HANDLE binaryFileHandle = CreateFileA(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + if (binaryFileHandle ==INVALID_HANDLE_VALUE) + { + DWORD errorCode; + errorCode = GetLastError(); + switch (errorCode) + { + case ERROR_FILE_NOT_FOUND: + { + b3Warning("\nCached file not found %s\n", binaryFileName); + break; + } + case ERROR_PATH_NOT_FOUND: + { + b3Warning("\nCached file path not found %s\n", binaryFileName); + break; + } + default: + { + b3Warning("\nFailed reading cached file with errorCode = %d\n", errorCode); + } + } + } else + { + if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0) + { + DWORD errorCode; + errorCode = GetLastError(); + b3Warning("\nGetFileTime errorCode = %d\n", errorCode); + } else + { + binaryFileValid = 1; + } + CloseHandle(binaryFileHandle); + } + + if (binaryFileValid) + { + HANDLE srcFileHandle = CreateFileA(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + + if (srcFileHandle==INVALID_HANDLE_VALUE) + { + const char* prefix[]={"./","../","../../","../../../","../../../../"}; + for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<5;i++) + { + char relativeFileName[1024]; + sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); + srcFileHandle = CreateFileA(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + } + + } + + + if (srcFileHandle!=INVALID_HANDLE_VALUE) + { + FILETIME modtimeSrc; + if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0) + { + DWORD errorCode; + errorCode = GetLastError(); + b3Warning("\nGetFileTime errorCode = %d\n", errorCode); + } + if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) + ||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime))) + { + fileUpToDate=1; + } else + { + b3Warning("\nCached binary file out-of-date (%s)\n",binaryFileName); + } + CloseHandle(srcFileHandle); + } + else + { +#ifdef _DEBUG + DWORD errorCode; + errorCode = GetLastError(); + switch (errorCode) + { + case ERROR_FILE_NOT_FOUND: + { + b3Warning("\nSrc file not found %s\n", clFileNameForCaching); + break; + } + case ERROR_PATH_NOT_FOUND: + { + b3Warning("\nSrc path not found %s\n", clFileNameForCaching); + break; + } + default: + { + b3Warning("\nnSrc file reading errorCode = %d\n", errorCode); + } + } + + //we should make sure the src file exists so we can verify the timestamp with binary +// assert(0); + b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n",clFileNameForCaching, binaryFileName); + fileUpToDate = true; +#else + //if we cannot find the source, assume it is OK in release builds + fileUpToDate = true; +#endif + } + } + + + } + + + +#else + fileUpToDate = true; + if (mkdir(sCachedBinaryPath,0777) == -1) + { + } + else + { + b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath); + } +#endif //_WIN32 + } + + + if( fileUpToDate) + { +#ifdef _MSC_VER + FILE* file; + if (fopen_s(&file,binaryFileName, "rb")!=0) + file=0; +#else + FILE* file = fopen(binaryFileName, "rb"); +#endif + + if (file) + { + size_t binarySize=0; + char* binary =0; + + fseek( file, 0L, SEEK_END ); + binarySize = ftell( file ); + rewind( file ); + binary = (char*)malloc(sizeof(char)*binarySize); + int bytesRead; + bytesRead = fread( binary, sizeof(char), binarySize, file ); + fclose( file ); + + m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status ); + b3Assert( status == CL_SUCCESS ); + status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 ); + b3Assert( status == CL_SUCCESS ); + + if( status != CL_SUCCESS ) + { + char *build_log; + size_t ret_val_size; + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + build_log = (char*)malloc(sizeof(char)*(ret_val_size+1)); + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + build_log[ret_val_size] = '\0'; + b3Error("%s\n", build_log); + free (build_log); + b3Assert(0); + m_cpProgram = 0; + + b3Warning("clBuildProgram reported failure on cached binary: %s\n",binaryFileName); + + } else + { + b3Printf("clBuildProgram successfully compiled cached binary: %s\n",binaryFileName); + } + free (binary); + + } else + { + b3Warning("Cannot open cached binary: %s\n",binaryFileName); + } + } + + + + + + + + + + if (!m_cpProgram) + { + + cl_int localErrNum; + char* compileFlags; + int flagsize; + + + + const char* kernelSource = kernelSourceOrg; + + if (!kernelSourceOrg || gDebugForceLoadingFromSource) + { + if (clFileNameForCaching) + { + + FILE* file = fopen(clFileNameForCaching, "rb"); + //in many cases the relative path is a few levels up the directory hierarchy, so try it + if (!file) + { + const char* prefix[]={"../","../../","../../../","../../../../"}; + for (int i=0;!file && i<3;i++) + { + char relativeFileName[1024]; + sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); + file = fopen(relativeFileName, "rb"); + } + } + + if (file) + { + char* kernelSrc=0; + fseek( file, 0L, SEEK_END ); + int kernelSize = ftell( file ); + rewind( file ); + kernelSrc = (char*)malloc(kernelSize+1); + int readBytes; + readBytes = fread((void*)kernelSrc,1,kernelSize, file); + kernelSrc[kernelSize] = 0; + fclose(file); + kernelSource = kernelSrc; + } + } + } + + size_t program_length = kernelSource ? strlen(kernelSource) : 0; +#ifdef MAC //or __APPLE__? + char* flags = "-cl-mad-enable -DMAC "; +#else + const char* flags = ""; +#endif + + + m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum); + if (localErrNum!= CL_SUCCESS) + { + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + + // Build the program with 'mad' Optimization option + + + + flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5); + compileFlags = (char*) malloc(flagsize); +#ifdef _MSC_VER + sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros); +#else + sprintf(compileFlags, "%s %s", flags, additionalMacros); +#endif + localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL); + if (localErrNum!= CL_SUCCESS) + { + char *build_log; + size_t ret_val_size; + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + build_log = (char*) malloc(sizeof(char)*(ret_val_size+1)); + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + + // to be carefully, terminate with \0 + // there's no information in the reference whether the string is 0 terminated or not + build_log[ret_val_size] = '\0'; + + + b3Error("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log); + free (build_log); + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + + + if( !disableBinaryCaching && clFileNameForCaching ) + { // write to binary + + cl_uint numAssociatedDevices; + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 ); + b3Assert( status == CL_SUCCESS ); + if (numAssociatedDevices==1) + { + + size_t binarySize; + char* binary ; + + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 ); + b3Assert( status == CL_SUCCESS ); + + binary = (char*)malloc(sizeof(char)*binarySize); + + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 ); + b3Assert( status == CL_SUCCESS ); + + { + FILE* file=0; +#ifdef _MSC_VER + if (fopen_s(&file,binaryFileName, "wb")!=0) + file=0; +#else + file = fopen(binaryFileName, "wb"); +#endif + if (file) + { + fwrite( binary, sizeof(char), binarySize, file ); + fclose( file ); + } else + { + b3Warning("cannot write file %s\n", binaryFileName); + } + } + + free (binary); + } + } + + free(compileFlags); + + } + return m_cpProgram; +} + + +cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros ) +{ + + cl_kernel kernel; + cl_int localErrNum; + + cl_program m_cpProgram = prog; + + b3Printf("compiling kernel %s ",kernelName); + + if (!m_cpProgram) + { + m_cpProgram = b3OpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0, false); + } + + + // Create the kernel + kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum); + if (localErrNum != CL_SUCCESS) + { + b3Error("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName); + assert(0); + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + + if (!prog && m_cpProgram) + { + clReleaseProgram(m_cpProgram); + } + b3Printf("ready. \n"); + + + if (pErrNum) + *pErrNum = CL_SUCCESS; + return kernel; + +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h new file mode 100644 index 0000000000..db6466e76b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h @@ -0,0 +1,194 @@ +/* +Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org +Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +//original author: Roman Ponomarev +//cleanup by Erwin Coumans + +#ifndef B3_OPENCL_UTILS_H +#define B3_OPENCL_UTILS_H + +#include "b3OpenCLInclude.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +///C API for OpenCL utilities: convenience functions, see below for C++ API + +/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code +/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. +cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId); + +int b3OpenCLUtils_getNumDevices(cl_context cxMainContext); + +cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr); + +void b3OpenCLUtils_printDeviceInfo(cl_device_id device); + +cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros); + +//optional +cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching, bool disableBinaryCaching); + +//the following optional APIs provide access using specific platform information +int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum); + +///get the nr'th platform, where nr is in the range [0..getNumPlatforms) +cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum); + + +void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform); + +const char* b3OpenCLUtils_getSdkVendorName(); + +///set the path (directory/folder) where the compiled OpenCL kernel are stored +void b3OpenCLUtils_setCachePath(const char* path); + +cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex); + +#ifdef __cplusplus +} + +#define B3_MAX_STRING_LENGTH 1024 + +typedef struct +{ + char m_deviceName[B3_MAX_STRING_LENGTH]; + char m_deviceVendor[B3_MAX_STRING_LENGTH]; + char m_driverVersion[B3_MAX_STRING_LENGTH]; + char m_deviceExtensions[B3_MAX_STRING_LENGTH]; + + cl_device_type m_deviceType; + cl_uint m_computeUnits; + size_t m_workitemDims; + size_t m_workItemSize[3]; + size_t m_image2dMaxWidth; + size_t m_image2dMaxHeight; + size_t m_image3dMaxWidth; + size_t m_image3dMaxHeight; + size_t m_image3dMaxDepth; + size_t m_workgroupSize; + cl_uint m_clockFrequency; + cl_ulong m_constantBufferSize; + cl_ulong m_localMemSize; + cl_ulong m_globalMemSize; + cl_bool m_errorCorrectionSupport; + cl_device_local_mem_type m_localMemType; + cl_uint m_maxReadImageArgs; + cl_uint m_maxWriteImageArgs; + + + + cl_uint m_addressBits; + cl_ulong m_maxMemAllocSize; + cl_command_queue_properties m_queueProperties; + cl_bool m_imageSupport; + cl_uint m_vecWidthChar; + cl_uint m_vecWidthShort; + cl_uint m_vecWidthInt; + cl_uint m_vecWidthLong; + cl_uint m_vecWidthFloat; + cl_uint m_vecWidthDouble; + +} b3OpenCLDeviceInfo; + +struct b3OpenCLPlatformInfo +{ + char m_platformVendor[B3_MAX_STRING_LENGTH]; + char m_platformName[B3_MAX_STRING_LENGTH]; + char m_platformVersion[B3_MAX_STRING_LENGTH]; + + b3OpenCLPlatformInfo() + { + m_platformVendor[0]=0; + m_platformName[0]=0; + m_platformVersion[0]=0; + } +}; + + +///C++ API for OpenCL utilities: convenience functions +struct b3OpenCLUtils +{ + /// CL Context optionally takes a GL context. This is a generic type because we don't really want this code + /// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. + static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0) + { + return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId); + } + + static inline int getNumDevices(cl_context cxMainContext) + { + return b3OpenCLUtils_getNumDevices(cxMainContext); + } + static inline cl_device_id getDevice(cl_context cxMainContext, int nr) + { + return b3OpenCLUtils_getDevice(cxMainContext,nr); + } + + static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info); + + static inline void printDeviceInfo(cl_device_id device) + { + b3OpenCLUtils_printDeviceInfo(device); + } + + static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" ) + { + return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros); + } + + //optional + static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false) + { + return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching); + } + + //the following optional APIs provide access using specific platform information + static inline int getNumPlatforms(cl_int* pErrNum=0) + { + return b3OpenCLUtils_getNumPlatforms(pErrNum); + } + ///get the nr'th platform, where nr is in the range [0..getNumPlatforms) + static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0) + { + return b3OpenCLUtils_getPlatform(nr,pErrNum); + } + + static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo); + + static inline void printPlatformInfo(cl_platform_id platform) + { + b3OpenCLUtils_printPlatformInfo(platform); + } + + static inline const char* getSdkVendorName() + { + return b3OpenCLUtils_getSdkVendorName(); + } + static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1) + { + return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex); + } + static void setCachePath(const char* path) + { + b3OpenCLUtils_setCachePath(path); + } +}; + +#endif //__cplusplus + +#endif // B3_OPENCL_UTILS_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h new file mode 100644 index 0000000000..872f039506 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h @@ -0,0 +1,18 @@ +#ifndef B3_BVH_INFO_H +#define B3_BVH_INFO_H + +#include "Bullet3Common/b3Vector3.h" + +struct b3BvhInfo +{ + b3Vector3 m_aabbMin; + b3Vector3 m_aabbMax; + b3Vector3 m_quantization; + int m_numNodes; + int m_numSubTrees; + int m_nodeOffset; + int m_subTreeOffset; + +}; + +#endif //B3_BVH_INFO_H
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp new file mode 100644 index 0000000000..cb30ee939b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp @@ -0,0 +1,258 @@ + +#if 0 +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include "b3ContactCache.h" +#include "Bullet3Common/b3Transform.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +b3Scalar gContactBreakingThreshold = b3Scalar(0.02); + +///gContactCalcArea3Points will approximate the convex hull area using 3 points +///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower +bool gContactCalcArea3Points = true; + + + + +static inline b3Scalar calcArea4Points(const b3Vector3 &p0,const b3Vector3 &p1,const b3Vector3 &p2,const b3Vector3 &p3) +{ + // It calculates possible 3 area constructed from random 4 points and returns the biggest one. + + b3Vector3 a[3],b[3]; + a[0] = p0 - p1; + a[1] = p0 - p2; + a[2] = p0 - p3; + b[0] = p2 - p3; + b[1] = p1 - p3; + b[2] = p1 - p2; + + //todo: Following 3 cross production can be easily optimized by SIMD. + b3Vector3 tmp0 = a[0].cross(b[0]); + b3Vector3 tmp1 = a[1].cross(b[1]); + b3Vector3 tmp2 = a[2].cross(b[2]); + + return b3Max(b3Max(tmp0.length2(),tmp1.length2()),tmp2.length2()); +} +#if 0 + +//using localPointA for all points +int b3ContactCache::sortCachedPoints(const b3Vector3& pt) +{ + //calculate 4 possible cases areas, and take biggest area + //also need to keep 'deepest' + + int maxPenetrationIndex = -1; +#define KEEP_DEEPEST_POINT 1 +#ifdef KEEP_DEEPEST_POINT + b3Scalar maxPenetration = pt.getDistance(); + for (int i=0;i<4;i++) + { + if (m_pointCache[i].getDistance() < maxPenetration) + { + maxPenetrationIndex = i; + maxPenetration = m_pointCache[i].getDistance(); + } + } +#endif //KEEP_DEEPEST_POINT + + b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.)); + + if (gContactCalcArea3Points) + { + if (maxPenetrationIndex != 0) + { + b3Vector3 a0 = pt.m_localPointA-m_pointCache[1].m_localPointA; + b3Vector3 b0 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA; + b3Vector3 cross = a0.cross(b0); + res0 = cross.length2(); + } + if (maxPenetrationIndex != 1) + { + b3Vector3 a1 = pt.m_localPointA-m_pointCache[0].m_localPointA; + b3Vector3 b1 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA; + b3Vector3 cross = a1.cross(b1); + res1 = cross.length2(); + } + + if (maxPenetrationIndex != 2) + { + b3Vector3 a2 = pt.m_localPointA-m_pointCache[0].m_localPointA; + b3Vector3 b2 = m_pointCache[3].m_localPointA-m_pointCache[1].m_localPointA; + b3Vector3 cross = a2.cross(b2); + res2 = cross.length2(); + } + + if (maxPenetrationIndex != 3) + { + b3Vector3 a3 = pt.m_localPointA-m_pointCache[0].m_localPointA; + b3Vector3 b3 = m_pointCache[2].m_localPointA-m_pointCache[1].m_localPointA; + b3Vector3 cross = a3.cross(b3); + res3 = cross.length2(); + } + } + else + { + if(maxPenetrationIndex != 0) { + res0 = calcArea4Points(pt.m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA); + } + + if(maxPenetrationIndex != 1) { + res1 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[2].m_localPointA,m_pointCache[3].m_localPointA); + } + + if(maxPenetrationIndex != 2) { + res2 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[3].m_localPointA); + } + + if(maxPenetrationIndex != 3) { + res3 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA); + } + } + b3Vector4 maxvec(res0,res1,res2,res3); + int biggestarea = maxvec.closestAxis4(); + return biggestarea; + +} + + +int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const +{ + b3Scalar shortestDist = getContactBreakingThreshold() * getContactBreakingThreshold(); + int size = getNumContacts(); + int nearestPoint = -1; + for( int i = 0; i < size; i++ ) + { + const b3Vector3 &mp = m_pointCache[i]; + + b3Vector3 diffA = mp.m_localPointA- newPoint.m_localPointA; + const b3Scalar distToManiPoint = diffA.dot(diffA); + if( distToManiPoint < shortestDist ) + { + shortestDist = distToManiPoint; + nearestPoint = i; + } + } + return nearestPoint; +} + +int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint) +{ + b3Assert(validContactDistance(newPoint)); + + int insertIndex = getNumContacts(); + if (insertIndex == MANIFOLD_CACHE_SIZE) + { +#if MANIFOLD_CACHE_SIZE >= 4 + //sort cache so best points come first, based on area + insertIndex = sortCachedPoints(newPoint); +#else + insertIndex = 0; +#endif + clearUserCache(m_pointCache[insertIndex]); + + } else + { + m_cachedPoints++; + + + } + if (insertIndex<0) + insertIndex=0; + + //b3Assert(m_pointCache[insertIndex].m_userPersistentData==0); + m_pointCache[insertIndex] = newPoint; + return insertIndex; +} + +#endif + +bool b3ContactCache::validContactDistance(const b3Vector3& pt) +{ + return pt.w <= gContactBreakingThreshold; +} + +void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i) +{ + int numContacts = b3Contact4Data_getNumPoints(&newContactCache); + if (i!=(numContacts-1)) + { + b3Swap(newContactCache.m_localPosA[i],newContactCache.m_localPosA[numContacts-1]); + b3Swap(newContactCache.m_localPosB[i],newContactCache.m_localPosB[numContacts-1]); + b3Swap(newContactCache.m_worldPosB[i],newContactCache.m_worldPosB[numContacts-1]); + } + b3Contact4Data_setNumPoints(&newContactCache,numContacts-1); + +} + + +void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts) +{ + + int numContacts = b3Contact4Data_getNumPoints(&contacts); + + + int i; + /// first refresh worldspace positions and distance + for (i=numContacts-1;i>=0;i--) + { + b3Vector3 worldPosA = trA( contacts.m_localPosA[i]); + b3Vector3 worldPosB = trB( contacts.m_localPosB[i]); + contacts.m_worldPosB[i] = worldPosB; + float distance = (worldPosA - worldPosB).dot(contacts.m_worldNormalOnB); + contacts.m_worldPosB[i].w = distance; + } + + /// then + b3Scalar distance2d; + b3Vector3 projectedDifference,projectedPoint; + for (i=numContacts-1;i>=0;i--) + { + b3Vector3 worldPosA = trA( contacts.m_localPosA[i]); + b3Vector3 worldPosB = trB( contacts.m_localPosB[i]); + b3Vector3&pt = contacts.m_worldPosB[i]; + //contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction) + if (!validContactDistance(pt)) + { + removeContactPoint(contacts,i); + } else + { + //contact also becomes invalid when relative movement orthogonal to normal exceeds margin + projectedPoint = worldPosA - contacts.m_worldNormalOnB * contacts.m_worldPosB[i].w; + projectedDifference = contacts.m_worldPosB[i] - projectedPoint; + distance2d = projectedDifference.dot(projectedDifference); + if (distance2d > gContactBreakingThreshold*gContactBreakingThreshold ) + { + removeContactPoint(contacts,i); + } else + { + ////contact point processed callback + //if (gContactProcessedCallback) + // (*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1); + } + } + } + + +} + + + + + +#endif diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h new file mode 100644 index 0000000000..d6c9b0a07e --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h @@ -0,0 +1,80 @@ + +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_CONTACT_CACHE_H +#define B3_CONTACT_CACHE_H + + +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3Common/b3Transform.h" +#include "Bullet3Common/b3AlignedAllocator.h" + + +///maximum contact breaking and merging threshold +extern b3Scalar gContactBreakingThreshold; + + + +#define MANIFOLD_CACHE_SIZE 4 + +///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase. +///Those contact points are created by the collision narrow phase. +///The cache can be empty, or hold 1,2,3 or 4 points. Some collision algorithms (GJK) might only add one point at a time. +///updates/refreshes old contact points, and throw them away if necessary (distance becomes too large) +///reduces the cache to 4 points, when more then 4 points are added, using following rules: +///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points +///note that some pairs of objects might have more then one contact manifold. +B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache +{ + + + + + /// sort cached points so most isolated points come first + int sortCachedPoints(const b3Vector3& pt); + + + +public: + + B3_DECLARE_ALIGNED_ALLOCATOR(); + + + + int addManifoldPoint( const b3Vector3& newPoint); + + /*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex) + { + b3Assert(validContactDistance(newPoint)); + m_pointCache[insertIndex] = newPoint; + } + */ + + + + static bool validContactDistance(const b3Vector3& pt); + + /// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin + static void refreshContactPoints( const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache); + + static void removeContactPoint(struct b3Contact4Data& newContactCache,int i); + + +}; + + + +#endif //B3_CONTACT_CACHE_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp new file mode 100644 index 0000000000..fb435aa7fd --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp @@ -0,0 +1,4733 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +bool findSeparatingAxisOnGpu = true; +bool splitSearchSepAxisConcave = false; +bool splitSearchSepAxisConvex = true; +bool useMprGpu = true;//use mpr for edge-edge (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling... +bool bvhTraversalKernelGPU = true; +bool findConcaveSeparatingAxisKernelGPU = true; +bool clipConcaveFacesAndFindContactsCPU = false;//false;//true; +bool clipConvexFacesAndFindContactsCPU = false;//false;//true; +bool reduceConcaveContactsOnGPU = true;//false; +bool reduceConvexContactsOnGPU = true;//false; +bool findConvexClippingFacesGPU = true; +bool useGjk = false;///option for CPU/host testing, when findSeparatingAxisOnGpu = false +bool useGjkContacts = false;//////option for CPU/host testing when findSeparatingAxisOnGpu = false + + +static int myframecount=0;///for testing + +///This file was written by Erwin Coumans +///Separating axis rest based on work from Pierre Terdiman, see +///And contact clipping based on work from Simon Hobbs + +//#define B3_DEBUG_SAT_FACE + +//#define CHECK_ON_HOST + +#ifdef CHECK_ON_HOST +//#define PERSISTENT_CONTACTS_HOST +#endif + +int b3g_actualSATPairTests=0; + +#include "b3ConvexHullContact.h" +#include <string.h>//memcpy +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h" + +#include "Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h" +#include "Bullet3Geometry/b3AabbUtil.h" + +typedef b3AlignedObjectArray<b3Vector3> b3VertexArray; + + +#include <float.h> //for FLT_MAX +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +//#include "AdlQuaternion.h" + +#include "kernels/satKernels.h" +#include "kernels/mprKernels.h" + +#include "kernels/satConcaveKernels.h" + +#include "kernels/satClipHullContacts.h" +#include "kernels/bvhTraversal.h" +#include "kernels/primitiveContacts.h" + + +#include "Bullet3Geometry/b3AabbUtil.h" + +#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" +#define BT_NARROWPHASE_SAT_CONCAVE_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl" + +#define BT_NARROWPHASE_MPR_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl" + + +#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" +#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" +#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" + + +#ifndef __global +#define __global +#endif + +#ifndef __kernel +#define __kernel +#endif + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h" + + + +#define dot3F4 b3Dot + +GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q ) +:m_context(ctx), +m_device(device), +m_queue(q), + +m_findSeparatingAxisKernel(0), +m_findSeparatingAxisVertexFaceKernel(0), +m_findSeparatingAxisEdgeEdgeKernel(0), +m_unitSphereDirections(m_context,m_queue), + +m_totalContactsOut(m_context, m_queue), +m_sepNormals(m_context, m_queue), +m_dmins(m_context,m_queue), + +m_hasSeparatingNormals(m_context, m_queue), +m_concaveSepNormals(m_context, m_queue), +m_concaveHasSeparatingNormals(m_context,m_queue), +m_numConcavePairsOut(m_context, m_queue), + + +m_gpuCompoundPairs(m_context, m_queue), + + +m_gpuCompoundSepNormals(m_context, m_queue), +m_gpuHasCompoundSepNormals(m_context, m_queue), + +m_numCompoundPairsOut(m_context, m_queue) +{ + m_totalContactsOut.push_back(0); + + cl_int errNum=0; + + if (1) + { + const char* mprSrc = mprKernelsCL; + + const char* srcConcave = satConcaveKernelsCL; + char flags[1024]={0}; +//#ifdef CL_PLATFORM_INTEL +// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl"); +//#endif + m_mprPenetrationKernel = 0; + m_findSeparatingAxisUnitSphereKernel = 0; + + if (useMprGpu) + { + cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,mprSrc,&errNum,flags,BT_NARROWPHASE_MPR_PATH); + b3Assert(errNum==CL_SUCCESS); + + m_mprPenetrationKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "mprPenetrationKernel",&errNum,mprProg ); + b3Assert(m_mprPenetrationKernel); + b3Assert(errNum==CL_SUCCESS); + + m_findSeparatingAxisUnitSphereKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "findSeparatingAxisUnitSphereKernel",&errNum,mprProg ); + b3Assert(m_findSeparatingAxisUnitSphereKernel); + b3Assert(errNum==CL_SUCCESS); + + + int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); + m_unitSphereDirections.resize(numDirections); + m_unitSphereDirections.copyFromHostPointer(unitSphere162,numDirections,0,true); + + + } + + + cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,satKernelsCL,&errNum,flags,BT_NARROWPHASE_SAT_PATH); + b3Assert(errNum==CL_SUCCESS); + + cl_program satConcaveProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcConcave,&errNum,flags,BT_NARROWPHASE_SAT_CONCAVE_PATH); + b3Assert(errNum==CL_SUCCESS); + + m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisKernel",&errNum,satProg ); + b3Assert(m_findSeparatingAxisKernel); + b3Assert(errNum==CL_SUCCESS); + + + m_findSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisVertexFaceKernel",&errNum,satProg ); + b3Assert(m_findSeparatingAxisVertexFaceKernel); + + m_findSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisEdgeEdgeKernel",&errNum,satProg ); + b3Assert(m_findSeparatingAxisVertexFaceKernel); + + + m_findConcaveSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findConcaveSeparatingAxisKernel",&errNum,satProg ); + b3Assert(m_findConcaveSeparatingAxisKernel); + b3Assert(errNum==CL_SUCCESS); + + m_findConcaveSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisVertexFaceKernel",&errNum,satConcaveProg ); + b3Assert(m_findConcaveSeparatingAxisVertexFaceKernel); + b3Assert(errNum==CL_SUCCESS); + + m_findConcaveSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisEdgeEdgeKernel",&errNum,satConcaveProg ); + b3Assert(m_findConcaveSeparatingAxisEdgeEdgeKernel); + b3Assert(errNum==CL_SUCCESS); + + + + + m_findCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findCompoundPairsKernel",&errNum,satProg ); + b3Assert(m_findCompoundPairsKernel); + b3Assert(errNum==CL_SUCCESS); + m_processCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "processCompoundPairsKernel",&errNum,satProg ); + b3Assert(m_processCompoundPairsKernel); + b3Assert(errNum==CL_SUCCESS); + } + + if (1) + { + const char* srcClip = satClipKernelsCL; + + char flags[1024]={0}; +//#ifdef CL_PLATFORM_INTEL +// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl"); +//#endif + + cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH); + b3Assert(errNum==CL_SUCCESS); + + m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + + m_clipCompoundsHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipCompoundsHullHullKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + + + m_findClippingFacesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "findClippingFacesKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + + m_clipFacesAndFindContacts = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipFacesAndFindContactsKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + + m_clipHullHullConcaveConvexKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullConcaveConvexKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + +// m_extractManifoldAndAddContactKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg); + // b3Assert(errNum==CL_SUCCESS); + + m_newContactReductionKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, + "newContactReductionKernel",&errNum,satClipContactsProg); + b3Assert(errNum==CL_SUCCESS); + } + else + { + m_clipHullHullKernel=0; + m_clipCompoundsHullHullKernel = 0; + m_findClippingFacesKernel = 0; + m_newContactReductionKernel=0; + m_clipFacesAndFindContacts = 0; + m_clipHullHullConcaveConvexKernel = 0; +// m_extractManifoldAndAddContactKernel = 0; + } + + if (1) + { + const char* srcBvh = bvhTraversalKernelCL; + cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH); + b3Assert(errNum==CL_SUCCESS); + + m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,""); + b3Assert(errNum==CL_SUCCESS); + + } + + { + const char* primitiveContactsSrc = primitiveContactsKernelsCL; + cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH); + b3Assert(errNum==CL_SUCCESS); + + m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,""); + b3Assert(errNum==CL_SUCCESS); + + m_findConcaveSphereContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "findConcaveSphereContactsKernel",&errNum,primitiveContactsProg ); + b3Assert(errNum==CL_SUCCESS); + b3Assert(m_findConcaveSphereContactsKernel); + + m_processCompoundPairsPrimitivesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "processCompoundPairsPrimitivesKernel",&errNum,primitiveContactsProg,""); + b3Assert(errNum==CL_SUCCESS); + b3Assert(m_processCompoundPairsPrimitivesKernel); + + } + + +} + +GpuSatCollision::~GpuSatCollision() +{ + + if (m_findSeparatingAxisVertexFaceKernel) + clReleaseKernel(m_findSeparatingAxisVertexFaceKernel); + + if (m_findSeparatingAxisEdgeEdgeKernel) + clReleaseKernel(m_findSeparatingAxisEdgeEdgeKernel); + + if (m_findSeparatingAxisUnitSphereKernel) + clReleaseKernel(m_findSeparatingAxisUnitSphereKernel); + + if (m_mprPenetrationKernel) + clReleaseKernel(m_mprPenetrationKernel); + + + if (m_findSeparatingAxisKernel) + clReleaseKernel(m_findSeparatingAxisKernel); + + if (m_findConcaveSeparatingAxisVertexFaceKernel) + clReleaseKernel(m_findConcaveSeparatingAxisVertexFaceKernel); + + + if (m_findConcaveSeparatingAxisEdgeEdgeKernel) + clReleaseKernel(m_findConcaveSeparatingAxisEdgeEdgeKernel); + + if (m_findConcaveSeparatingAxisKernel) + clReleaseKernel(m_findConcaveSeparatingAxisKernel); + + if (m_findCompoundPairsKernel) + clReleaseKernel(m_findCompoundPairsKernel); + + if (m_processCompoundPairsKernel) + clReleaseKernel(m_processCompoundPairsKernel); + + if (m_findClippingFacesKernel) + clReleaseKernel(m_findClippingFacesKernel); + + if (m_clipFacesAndFindContacts) + clReleaseKernel(m_clipFacesAndFindContacts); + if (m_newContactReductionKernel) + clReleaseKernel(m_newContactReductionKernel); + if (m_primitiveContactsKernel) + clReleaseKernel(m_primitiveContactsKernel); + + if (m_findConcaveSphereContactsKernel) + clReleaseKernel(m_findConcaveSphereContactsKernel); + + if (m_processCompoundPairsPrimitivesKernel) + clReleaseKernel(m_processCompoundPairsPrimitivesKernel); + + if (m_clipHullHullKernel) + clReleaseKernel(m_clipHullHullKernel); + if (m_clipCompoundsHullHullKernel) + clReleaseKernel(m_clipCompoundsHullHullKernel); + + if (m_clipHullHullConcaveConvexKernel) + clReleaseKernel(m_clipHullHullConcaveConvexKernel); +// if (m_extractManifoldAndAddContactKernel) + // clReleaseKernel(m_extractManifoldAndAddContactKernel); + + if (m_bvhTraversalKernel) + clReleaseKernel(m_bvhTraversalKernel); + +} + +struct MyTriangleCallback : public b3NodeOverlapCallback +{ + int m_bodyIndexA; + int m_bodyIndexB; + + virtual void processNode(int subPart, int triangleIndex) + { + printf("bodyIndexA %d, bodyIndexB %d\n",m_bodyIndexA,m_bodyIndexB); + printf("triangleIndex %d\n", triangleIndex); + } +}; + + +#define float4 b3Vector3 +#define make_float4(x,y,z,w) b3MakeVector3(x,y,z,w) + +float signedDistanceFromPointToPlane(const float4& point, const float4& planeEqn, float4* closestPointOnFace) +{ + float4 n = planeEqn; + n[3] = 0.f; + float dist = dot3F4(n, point) + planeEqn[3]; + *closestPointOnFace = point - dist * n; + return dist; +} + + + +#define cross3(a,b) (a.cross(b)) +b3Vector3 transform(const b3Vector3* v, const b3Vector3* pos, const b3Quaternion* orn) +{ + b3Transform tr; + tr.setIdentity(); + tr.setOrigin(*pos); + tr.setRotation(*orn); + b3Vector3 res = tr(*v); + return res; +} + + +inline bool IsPointInPolygon(const float4& p, + const b3GpuFace* face, + const float4* baseVertex, + const int* convexIndices, + float4* out) +{ + float4 a; + float4 b; + float4 ab; + float4 ap; + float4 v; + + float4 plane = b3MakeVector3(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f); + + if (face->m_numIndices<2) + return false; + + + float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]]; + b = v0; + + for(unsigned i=0; i != face->m_numIndices; ++i) + { + a = b; + float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]]; + b = vi; + ab = b-a; + ap = p-a; + v = cross3(ab,plane); + + if (b3Dot(ap, v) > 0.f) + { + float ab_m2 = b3Dot(ab, ab); + float rt = ab_m2 != 0.f ? b3Dot(ab, ap) / ab_m2 : 0.f; + if (rt <= 0.f) + { + *out = a; + } + else if (rt >= 1.f) + { + *out = b; + } + else + { + float s = 1.f - rt; + out[0].x = s * a.x + rt * b.x; + out[0].y = s * a.y + rt * b.y; + out[0].z = s * a.z + rt * b.z; + } + return false; + } + } + return true; +} + +#define normalize3(a) (a.normalize()) + + +int extractManifoldSequentialGlobal( const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) +{ + if( nPoints == 0 ) + return 0; + + if (nPoints <=4) + return nPoints; + + + if (nPoints >64) + nPoints = 64; + + float4 center = b3MakeVector3(0,0,0,0); + { + + for (int i=0;i<nPoints;i++) + center += p[i]; + center /= (float)nPoints; + } + + + + // sample 4 directions + + float4 aVector = p[0] - center; + float4 u = cross3( nearNormal, aVector ); + float4 v = cross3( nearNormal, u ); + u = normalize3( u ); + v = normalize3( v ); + + + //keep point with deepest penetration + float minW= FLT_MAX; + + int minIndex=-1; + + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // idx, distance + for(int ie = 0; ie<nPoints; ie++ ) + { + if (p[ie].w<minW) + { + minW = p[ie].w; + minIndex=ie; + } + float f; + float4 r = p[ie]-center; + f = dot3F4( u, r ); + if (f<maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4( -u, r ); + if (f<maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + + f = dot3F4( v, r ); + if (f<maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4( -v, r ); + if (f<maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; + +} + + + +#define MAX_VERTS 1024 + + +inline void project(const b3ConvexPolyhedronData& hull, const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) +{ + min = FLT_MAX; + max = -FLT_MAX; + int numVerts = hull.m_numVertices; + + const float4 localDir = b3QuatRotate(orn.inverse(),dir); + + b3Scalar offset = dot3F4(pos,dir); + + for(int i=0;i<numVerts;i++) + { + //b3Vector3 pt = trans * vertices[m_vertexOffset+i]; + //b3Scalar dp = pt.dot(dir); + //b3Vector3 vertex = vertices[hull.m_vertexOffset+i]; + b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset+i],localDir); + //b3Assert(dp==dpL); + if(dp < min) min = dp; + if(dp > max) max = dp; + } + if(min>max) + { + b3Scalar tmp = min; + min = max; + max = tmp; + } + min += offset; + max += offset; +} + + +static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA,const b3Quaternion& ornA, + const float4& posB,const b3Quaternion& ornB, + const float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth) +{ + b3Scalar Min0,Max0; + b3Scalar Min1,Max1; + project(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0); + project(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + b3Scalar d0 = Max0 - Min1; + assert(d0>=0.0f); + b3Scalar d1 = Max1 - Min0; + assert(d1>=0.0f); + depth = d0<d1 ? d0:d1; + return true; +} + +inline bool IsAlmostZero(const b3Vector3& v) +{ + if(fabsf(v.x)>1e-6 || fabsf(v.y)>1e-6 || fabsf(v.z)>1e-6) return false; + return true; +} + + +static bool findSeparatingAxis( const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA1, + const b3Quaternion& ornA, + const float4& posB1, + const b3Quaternion& ornB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, + const b3AlignedObjectArray<b3GpuFace>& facesA, + const b3AlignedObjectArray<int>& indicesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, + const b3AlignedObjectArray<b3GpuFace>& facesB, + const b3AlignedObjectArray<int>& indicesB, + + b3Vector3& sep) +{ + B3_PROFILE("findSeparatingAxis"); + + b3g_actualSATPairTests++; + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; +//#ifdef TEST_INTERNAL_OBJECTS + float4 c0local = (float4&)hullA.m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = (float4&)hullB.m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 deltaC2 = c0 - c1; +//#endif + + b3Scalar dmin = FLT_MAX; + int curPlaneTests=0; + + int numFacesA = hullA.m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4& normal = (float4&)facesA[hullA.m_faceOffset+i].m_plane; + float4 faceANormalWS = b3QuatRotate(ornA,normal); + + if (dot3F4(deltaC2,faceANormalWS)<0) + faceANormalWS*=-1.f; + + curPlaneTests++; +#ifdef TEST_INTERNAL_OBJECTS + gExpectedNbTests++; + if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin)) + continue; + gActualNbTests++; +#endif + + + b3Scalar d; + if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,faceANormalWS, verticesA, verticesB,d)) + return false; + + if(d<dmin) + { + dmin = d; + sep = (b3Vector3&)faceANormalWS; + } + } + + int numFacesB = hullB.m_numFaces; + // Test normals from hullB + for(int i=0;i<numFacesB;i++) + { + float4 normal = (float4&)facesB[hullB.m_faceOffset+i].m_plane; + float4 WorldNormal = b3QuatRotate(ornB, normal); + + if (dot3F4(deltaC2,WorldNormal)<0) + { + WorldNormal*=-1.f; + } + curPlaneTests++; +#ifdef TEST_INTERNAL_OBJECTS + gExpectedNbTests++; + if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin)) + continue; + gActualNbTests++; +#endif + + b3Scalar d; + if(!TestSepAxis(hullA, hullB,posA,ornA,posB,ornB,WorldNormal,verticesA,verticesB,d)) + return false; + + if(d<dmin) + { + dmin = d; + sep = (b3Vector3&)WorldNormal; + } + } + + int curEdgeEdge = 0; + // Test edges + for(int e0=0;e0<hullA.m_numUniqueEdges;e0++) + { + const float4& edge0 = (float4&) uniqueEdgesA[hullA.m_uniqueEdgesOffset+e0]; + float4 edge0World = b3QuatRotate(ornA,(float4&)edge0); + + for(int e1=0;e1<hullB.m_numUniqueEdges;e1++) + { + const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset+e1]; + float4 edge1World = b3QuatRotate(ornB,(float4&)edge1); + + + float4 crossje = cross3(edge0World,edge1World); + + curEdgeEdge++; + if(!IsAlmostZero((b3Vector3&)crossje)) + { + crossje = normalize3(crossje); + if (dot3F4(deltaC2,crossje)<0) + crossje*=-1.f; + + +#ifdef TEST_INTERNAL_OBJECTS + gExpectedNbTests++; + if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin)) + continue; + gActualNbTests++; +#endif + + b3Scalar dist; + if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,crossje, verticesA,verticesB,dist)) + return false; + + if(dist<dmin) + { + dmin = dist; + sep = (b3Vector3&)crossje; + } + } + } + + } + + + if((dot3F4(-deltaC2,(float4&)sep))>0.0f) + sep = -sep; + + return true; +} + + +bool findSeparatingAxisEdgeEdge( __global const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, + const b3Float4& posA1, + const b3Quat& ornA, + const b3Float4& posB1, + const b3Quat& ornB, + const b3Float4& DeltaC2, + __global const b3AlignedObjectArray<float4>& vertices, + __global const b3AlignedObjectArray<float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + float4* sep, + float* dmin) +{ +// int i = get_global_id(0); + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + //int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test edges + for(int e0=0;e0<hullA->m_numUniqueEdges;e0++) + { + const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0]; + float4 edge0World = b3QuatRotate(ornA,edge0); + + for(int e1=0;e1<hullB->m_numUniqueEdges;e1++) + { + const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1]; + float4 edge1World = b3QuatRotate(ornB,edge1); + + + float4 crossje = cross3(edge0World,edge1World); + + curEdgeEdge++; + if(!IsAlmostZero(crossje)) + { + crossje = normalize3(crossje); + if (dot3F4(DeltaC2,crossje)<0) + crossje*=-1.f; + + float dist; + bool result = true; + { + float Min0,Max0; + float Min1,Max1; + project(*hullA,posA,ornA,crossje,vertices, Min0, Max0); + project(*hullB,posB,ornB,crossje,vertices, Min1, Max1); + + if(Max0<Min1 || Max1<Min0) + result = false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + } + + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + +__inline float4 lerp3(const float4& a,const float4& b, float t) +{ + return b3MakeVector3( a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t, + 0.f); +} + + +// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut +int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float planeEqWS, float4* ppVtxOut) +{ + + int ve; + float ds, de; + int numVertsOut = 0; + if (numVertsIn < 2) + return 0; + + float4 firstVertex=pVtxIn[numVertsIn-1]; + float4 endVertex = pVtxIn[0]; + + ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS; + + for (ve = 0; ve < numVertsIn; ve++) + { + endVertex=pVtxIn[ve]; + + de = dot3F4(planeNormalWS,endVertex)+planeEqWS; + + if (ds<0) + { + if (de<0) + { + // Start < 0, end < 0, so output endVertex + ppVtxOut[numVertsOut++] = endVertex; + } + else + { + // Start < 0, end >= 0, so output intersection + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + } + } + else + { + if (de<0) + { + // Start >= 0, end < 0 so output intersection and end + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + ppVtxOut[numVertsOut++] = endVertex; + } + } + firstVertex = endVertex; + ds = de; + } + return numVertsOut; +} + + +int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedronData* hullA, + const float4& posA, const b3Quaternion& ornA, float4* worldVertsB1, int numWorldVertsB1, + float4* worldVertsB2, int capacityWorldVertsB2, + const float minDist, float maxDist, + const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, + //const float4* verticesB, const b3GpuFace* facesB, const int* indicesB, + float4* contactsOut, + int contactCapacity) +{ + int numContactsOut = 0; + + float4* pVtxIn = worldVertsB1; + float4* pVtxOut = worldVertsB2; + + int numVertsIn = numWorldVertsB1; + int numVertsOut = 0; + + int closestFaceA=-1; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = b3MakeVector3( + facesA[hullA->m_faceOffset+face].m_plane.x, + facesA[hullA->m_faceOffset+face].m_plane.y, + facesA[hullA->m_faceOffset+face].m_plane.z,0.f); + const float4 faceANormalWS = b3QuatRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + } + } + } + if (closestFaceA<0) + return numContactsOut; + + b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA]; + + // clip polygon to back of planes of all faces of hull A that are adjacent to witness face +// int numContacts = numWorldVertsB1; + int numVerticesA = polyA.m_numIndices; + for(int e0=0;e0<numVerticesA;e0++) + { + const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]]; + const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]]; + const float4 edge0 = a - b; + const float4 WorldEdge0 = b3QuatRotate(ornA,edge0); + float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float4 worldPlaneAnormal1 = b3QuatRotate(ornA,planeNormalA); + + float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); + float4 worldA1 = transform(&a,&posA,&ornA); + float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); + + float4 planeNormalWS = planeNormalWS1; + float planeEqWS=planeEqWS1; + + //clip face + //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS); + numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut); + + //btSwap(pVtxIn,pVtxOut); + float4* tmp = pVtxOut; + pVtxOut = pVtxIn; + pVtxIn = tmp; + numVertsIn = numVertsOut; + numVertsOut = 0; + } + + + // only keep points that are behind the witness face + { + float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float localPlaneEq = polyA.m_plane.w; + float4 planeNormalWS = b3QuatRotate(ornA,localPlaneNormal); + float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA); + for (int i=0;i<numVertsIn;i++) + { + float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; + if (depth <=minDist) + { + depth = minDist; + } + if (numContactsOut<contactCapacity) + { + if (depth <=maxDist) + { + float4 pointInWorld = pVtxIn[i]; + //resultOut.addContactPoint(separatingNormal,point,depth); + contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + //printf("depth=%f\n",depth); + } + } else + { + b3Error("exceeding contact capacity (%d,%df)\n", numContactsOut,contactCapacity); + } + } + } + + return numContactsOut; +} + + + +static int clipHullAgainstHull(const float4& separatingNormal, + const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA, const b3Quaternion& ornA,const float4& posB, const b3Quaternion& ornB, + float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts, + const float minDist, float maxDist, + const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, + const b3AlignedObjectArray<float4>& verticesB, const b3AlignedObjectArray<b3GpuFace>& facesB, const b3AlignedObjectArray<int>& indicesB, + + float4* contactsOut, + int contactCapacity) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + B3_PROFILE("clipHullAgainstHull"); + +// float curMaxDist=maxDist; + int closestFaceB=-1; + float dmax = -FLT_MAX; + + { + //B3_PROFILE("closestFaceB"); + if (hullB.m_numFaces!=1) + { + //printf("wtf\n"); + } + static bool once = true; + //printf("separatingNormal=%f,%f,%f\n",separatingNormal.x,separatingNormal.y,separatingNormal.z); + + for(int face=0;face<hullB.m_numFaces;face++) + { +#ifdef BT_DEBUG_SAT_FACE + if (once) + printf("face %d\n",face); + const b3GpuFace* faceB = &facesB[hullB.m_faceOffset+face]; + if (once) + { + for (int i=0;i<faceB->m_numIndices;i++) + { + float4 vert = verticesB[hullB.m_vertexOffset+indicesB[faceB->m_indexOffset+i]]; + printf("vert[%d] = %f,%f,%f\n",i,vert.x,vert.y,vert.z); + } + } +#endif //BT_DEBUG_SAT_FACE + //if (facesB[hullB.m_faceOffset+face].m_numIndices>2) + { + const float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset+face].m_plane.x, + facesB[hullB.m_faceOffset+face].m_plane.y, facesB[hullB.m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = b3QuatRotate(ornB, Normal); +#ifdef BT_DEBUG_SAT_FACE + if (once) + printf("faceNormal = %f,%f,%f\n",Normal.x,Normal.y,Normal.z); +#endif + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + once = false; + } + + + b3Assert(closestFaceB>=0); + { + //B3_PROFILE("worldVertsB1"); + const b3GpuFace& polyB = facesB[hullB.m_faceOffset+closestFaceB]; + const int numVertices = polyB.m_numIndices; + for(int e0=0;e0<numVertices;e0++) + { + const float4& b = verticesB[hullB.m_vertexOffset+indicesB[polyB.m_indexOffset+e0]]; + worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + + if (closestFaceB>=0) + { + //B3_PROFILE("clipFaceAgainstHull"); + numContactsOut = clipFaceAgainstHull((float4&)separatingNormal, &hullA, + posA,ornA, + worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist, + verticesA, facesA, indicesA, + contactsOut,contactCapacity); + } + + return numContactsOut; +} + + + + + + +#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; +#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} +#define REDUCE_MAX(v, n) {int i=0;\ +for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } +#define REDUCE_MIN(v, n) {int i=0;\ +for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; } + +int extractManifold(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) +{ + if( nPoints == 0 ) + return 0; + + if (nPoints <=4) + return nPoints; + + + if (nPoints >64) + nPoints = 64; + + float4 center = make_float4(0,0,0,0); + { + + for (int i=0;i<nPoints;i++) + center += p[i]; + center /= (float)nPoints; + } + + + + // sample 4 directions + + float4 aVector = p[0] - center; + float4 u = cross3( nearNormal, aVector ); + float4 v = cross3( nearNormal, u ); + u = normalize3( u ); + v = normalize3( v ); + + + //keep point with deepest penetration + float minW= FLT_MAX; + + int minIndex=-1; + + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // idx, distance + for(int ie = 0; ie<nPoints; ie++ ) + { + if (p[ie].w<minW) + { + minW = p[ie].w; + minIndex=ie; + } + float f; + float4 r = p[ie]-center; + f = dot3F4( u, r ); + if (f<maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4( -u, r ); + if (f<maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + + f = dot3F4( v, r ); + if (f<maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4( -v, r ); + if (f<maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; + +} + + + + +int clipHullHullSingle( + int bodyIndexA, int bodyIndexB, + const float4& posA, + const b3Quaternion& ornA, + const float4& posB, + const b3Quaternion& ornB, + + int collidableIndexA, int collidableIndexB, + + const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, + b3AlignedObjectArray<b3Contact4>* globalContactOut, + int& nContacts, + + const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA, + const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB, + + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, + const b3AlignedObjectArray<b3GpuFace>& facesA, + const b3AlignedObjectArray<int>& indicesA, + + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, + const b3AlignedObjectArray<b3GpuFace>& facesB, + const b3AlignedObjectArray<int>& indicesB, + + const b3AlignedObjectArray<b3Collidable>& hostCollidablesA, + const b3AlignedObjectArray<b3Collidable>& hostCollidablesB, + const b3Vector3& sepNormalWorldSpace, + int maxContactCapacity ) +{ + int contactIndex = -1; + b3ConvexPolyhedronData hullA, hullB; + + b3Collidable colA = hostCollidablesA[collidableIndexA]; + hullA = hostConvexDataA[colA.m_shapeIndex]; + //printf("numvertsA = %d\n",hullA.m_numVertices); + + + b3Collidable colB = hostCollidablesB[collidableIndexB]; + hullB = hostConvexDataB[colB.m_shapeIndex]; + //printf("numvertsB = %d\n",hullB.m_numVertices); + + + float4 contactsOut[MAX_VERTS]; + int localContactCapacity = MAX_VERTS; + +#ifdef _WIN32 + b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x)); + b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x)); +#endif + + + { + + float4 worldVertsB1[MAX_VERTS]; + float4 worldVertsB2[MAX_VERTS]; + int capacityWorldVerts = MAX_VERTS; + + float4 hostNormal = make_float4(sepNormalWorldSpace.x,sepNormalWorldSpace.y,sepNormalWorldSpace.z,0.f); + int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex; + int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex; + + b3Scalar minDist = -1; + b3Scalar maxDist = 0.; + + + + b3Transform trA,trB; + { + //B3_PROFILE("transform computation"); + //trA.setIdentity(); + trA.setOrigin(b3MakeVector3(posA.x,posA.y,posA.z)); + trA.setRotation(b3Quaternion(ornA.x,ornA.y,ornA.z,ornA.w)); + + //trB.setIdentity(); + trB.setOrigin(b3MakeVector3(posB.x,posB.y,posB.z)); + trB.setRotation(b3Quaternion(ornB.x,ornB.y,ornB.z,ornB.w)); + } + + b3Quaternion trAorn = trA.getRotation(); + b3Quaternion trBorn = trB.getRotation(); + + int numContactsOut = clipHullAgainstHull(hostNormal, + hostConvexDataA.at(shapeA), + hostConvexDataB.at(shapeB), + (float4&)trA.getOrigin(), (b3Quaternion&)trAorn, + (float4&)trB.getOrigin(), (b3Quaternion&)trBorn, + worldVertsB1,worldVertsB2,capacityWorldVerts, + minDist, maxDist, + verticesA, facesA,indicesA, + verticesB, facesB,indicesB, + + contactsOut,localContactCapacity); + + if (numContactsOut>0) + { + B3_PROFILE("overlap"); + + float4 normalOnSurfaceB = (float4&)hostNormal; + + b3Int4 contactIdx; + contactIdx.x = 0; + contactIdx.y = 1; + contactIdx.z = 2; + contactIdx.w = 3; + + int numPoints = 0; + + { + // B3_PROFILE("extractManifold"); + numPoints = extractManifold(contactsOut, numContactsOut, normalOnSurfaceB, &contactIdx); + } + + b3Assert(numPoints); + + if (nContacts<maxContactCapacity) + { + contactIndex = nContacts; + globalContactOut->expand(); + b3Contact4& contact = globalContactOut->at(nContacts); + contact.m_batchIdx = 0;//i; + contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; + contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB; + + contact.m_frictionCoeffCmp = 45874; + contact.m_restituitionCoeffCmp = 0; + + // float distance = 0.f; + for (int p=0;p<numPoints;p++) + { + contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];//check if it is actually on B + contact.m_worldNormalOnB = normalOnSurfaceB; + } + //printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints); + contact.m_worldNormalOnB.w = (b3Scalar)numPoints; + nContacts++; + } else + { + b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts,maxContactCapacity); + } + } + } + return contactIndex; +} + + + + + +void computeContactPlaneConvex(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) +{ + + int shapeIndex = collidables[collidableIndexB].m_shapeIndex; + const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndex]; + + b3Vector3 posB = rigidBodies[bodyIndexB].m_pos; + b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat; + b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; + b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; + +// int numContactsOut = 0; +// int numWorldVertsB1= 0; + + b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; + b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z); + b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal); + float planeConstant = planeEq.w; + b3Transform convexWorldTransform; + convexWorldTransform.setIdentity(); + convexWorldTransform.setOrigin(posB); + convexWorldTransform.setRotation(ornB); + b3Transform planeTransform; + planeTransform.setIdentity(); + planeTransform.setOrigin(posA); + planeTransform.setRotation(ornA); + + b3Transform planeInConvex; + planeInConvex= convexWorldTransform.inverse() * planeTransform; + b3Transform convexInPlane; + convexInPlane = planeTransform.inverse() * convexWorldTransform; + + b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal; + float maxDot = -1e30; + int hitVertex=-1; + b3Vector3 hitVtx; + +#define MAX_PLANE_CONVEX_POINTS 64 + + b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS]; + int numPoints = 0; + + b3Int4 contactIdx; + contactIdx.s[0] = 0; + contactIdx.s[1] = 1; + contactIdx.s[2] = 2; + contactIdx.s[3] = 3; + + for (int i=0;i<hullB->m_numVertices;i++) + { + b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i]; + float curDot = vtx.dot(planeNormalInConvex); + + + if (curDot>maxDot) + { + hitVertex=i; + maxDot=curDot; + hitVtx = vtx; + //make sure the deepest points is always included + if (numPoints==MAX_PLANE_CONVEX_POINTS) + numPoints--; + } + + if (numPoints<MAX_PLANE_CONVEX_POINTS) + { + b3Vector3 vtxWorld = convexWorldTransform*vtx; + b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld; + float dist = planeNormal.dot(vtxInPlane)-planeConstant; + if (dist<0.f) + { + vtxWorld.w = dist; + contactPoints[numPoints] = vtxWorld; + numPoints++; + } + } + + } + + int numReducedPoints = 0; + + numReducedPoints = numPoints; + + if (numPoints>4) + { + numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx); + } + int dstIdx; +// dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); + + if (numReducedPoints>0) + { + if (nGlobalContactsOut < maxContactCapacity) + { + dstIdx=nGlobalContactsOut; + nGlobalContactsOut++; + + b3Contact4* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -planeNormalWorld; + c->setFrictionCoeff(0.7); + c->setRestituitionCoeff(0.f); + + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + for (int i=0;i<numReducedPoints;i++) + { + b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]]; + c->m_worldPosB[i] = pOnB1; + } + c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints; + }//if (dstIdx < numPairs) + } + + + +// printf("computeContactPlaneConvex\n"); +} + + + +B3_FORCE_INLINE b3Vector3 MyUnQuantize(const unsigned short* vecIn, const b3Vector3& quantization, const b3Vector3& bvhAabbMin) + { + b3Vector3 vecOut; + vecOut.setValue( + (b3Scalar)(vecIn[0]) / (quantization.x), + (b3Scalar)(vecIn[1]) / (quantization.y), + (b3Scalar)(vecIn[2]) / (quantization.z)); + vecOut += bvhAabbMin; + return vecOut; + } + +void traverseTreeTree() +{ + +} + +#include "Bullet3Common/shared/b3Mat3x3.h" + +int numAabbChecks = 0; +int maxNumAabbChecks = 0; +int maxDepth = 0; + +// work-in-progress +__kernel void findCompoundPairsKernel( + int pairIndex, + int bodyIndexA, + int bodyIndexB, + int collidableIndexA, + int collidableIndexB, + __global const b3RigidBodyData* rigidBodies, + __global const b3Collidable* collidables, + __global const b3ConvexPolyhedronData* convexShapes, + __global const b3AlignedObjectArray<b3Float4>& vertices, + __global const b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, + __global const b3AlignedObjectArray<b3Aabb>& aabbsLocalSpace, + __global const b3GpuChildShape* gpuChildShapes, + __global b3Int4* gpuCompoundPairsOut, + __global int* numCompoundPairsOut, + int maxNumCompoundPairsCapacity, + b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, + b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, + b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU + ) +{ + numAabbChecks=0; + maxNumAabbChecks=0; +// int i = pairIndex; + { + + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + return; + } + + if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + int bvhA = collidables[collidableIndexA].m_compoundBvhIndex; + int bvhB = collidables[collidableIndexB].m_compoundBvhIndex; + int numSubTreesA = bvhInfoCPU[bvhA].m_numSubTrees; + int subTreesOffsetA = bvhInfoCPU[bvhA].m_subTreeOffset; + int subTreesOffsetB = bvhInfoCPU[bvhB].m_subTreeOffset; + + + int numSubTreesB = bvhInfoCPU[bvhB].m_numSubTrees; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + + b3Transform transA; + transA.setIdentity(); + transA.setOrigin(posA); + transA.setRotation(ornA); + + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + b3Transform transB; + transB.setIdentity(); + transB.setOrigin(posB); + transB.setRotation(ornB); + + + + for (int p=0;p<numSubTreesA;p++) + { + b3BvhSubtreeInfo subtreeA = subTreesCPU[subTreesOffsetA+p]; + //bvhInfoCPU[bvhA].m_quantization + b3Vector3 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); + b3Vector3 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); + + b3Vector3 aabbAMinOut,aabbAMaxOut; + float margin=0.f; + b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + + for (int q=0;q<numSubTreesB;q++) + { + b3BvhSubtreeInfo subtreeB = subTreesCPU[subTreesOffsetB+q]; + + b3Vector3 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + + b3Vector3 aabbBMinOut,aabbBMaxOut; + float margin=0.f; + b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + + + numAabbChecks=0; + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + if (aabbOverlap) + { + + int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfoCPU[bvhA].m_nodeOffset; + // int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize; + + int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfoCPU[bvhB].m_nodeOffset; + // int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize; + + b3AlignedObjectArray<b3Int2> nodeStack; + b3Int2 node0; + node0.x = startNodeIndexA; + node0.y = startNodeIndexB; + + int maxStackDepth = 1024; + nodeStack.resize(maxStackDepth); + int depth=0; + nodeStack[depth++]=node0; + + do + { + if (depth > maxDepth) + { + maxDepth=depth; + printf("maxDepth=%d\n",maxDepth); + } + b3Int2 node = nodeStack[--depth]; + + b3Vector3 aMinLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); + b3Vector3 aMaxLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); + + b3Vector3 bMinLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 bMaxLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + + float margin=0.f; + b3Vector3 aabbAMinOut,aabbAMaxOut; + b3TransformAabb2(aMinLocal,aMaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + + b3Vector3 aabbBMinOut,aabbBMaxOut; + b3TransformAabb2(bMinLocal,bMaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + + numAabbChecks++; + bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + if (nodeOverlap) + { + bool isLeafA = treeNodesCPU[node.x].isLeafNode(); + bool isLeafB = treeNodesCPU[node.y].isLeafNode(); + bool isInternalA = !isLeafA; + bool isInternalB = !isLeafB; + + //fail, even though it might hit two leaf nodes + if (depth+4>maxStackDepth && !(isLeafA && isLeafB)) + { + b3Error("Error: traversal exceeded maxStackDepth\n"); + continue; + } + + if(isInternalA) + { + int nodeAleftChild = node.x+1; + bool isNodeALeftChildLeaf = treeNodesCPU[node.x+1].isLeafNode(); + int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + treeNodesCPU[node.x+1].getEscapeIndex(); + + if(isInternalB) + { + int nodeBleftChild = node.y+1; + bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode(); + int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex(); + + nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild); + nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild); + } + else + { + nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y); + nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y); + } + } + else + { + if(isInternalB) + { + int nodeBleftChild = node.y+1; + bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode(); + int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex(); + nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild); + } + else + { + int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + int childShapeIndexA = treeNodesCPU[node.x].getTriangleIndex(); + int childShapeIndexB = treeNodesCPU[node.y].getTriangleIndex(); + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + } + } + } + } + } while (depth); + maxNumAabbChecks = b3Max(numAabbChecks,maxNumAabbChecks); + } + } + } + + return; + } + + if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + + if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + + int numChildrenA = collidables[collidableIndexA].m_numChildShapes; + for (int c=0;c<numChildrenA;c++) + { + int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c; + int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; + b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + + + + b3Aabb aabbA = aabbsLocalSpace[childColIndexA]; + + + b3Transform transA; + transA.setIdentity(); + transA.setOrigin(newPosA); + transA.setRotation(newOrnA); + b3Scalar margin=0.0f; + + b3Vector3 aabbAMinOut,aabbAMaxOut; + + b3TransformAabb2((const b3Float4&)aabbA.m_min,(const b3Float4&)aabbA.m_max, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + + if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int b=0;b<numChildrenB;b++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + + + + b3Aabb aabbB = aabbsLocalSpace[childColIndexB]; + + b3Transform transB; + transB.setIdentity(); + transB.setOrigin(newPosB); + transB.setRotation(newOrnB); + + b3Vector3 aabbBMinOut,aabbBMaxOut; + b3TransformAabb2((const b3Float4&)aabbB.m_min,(const b3Float4&)aabbB.m_max, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + + numAabbChecks++; + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + if (aabbOverlap) + { + /* + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + float4 posA = newPosA; + posA.w = 0.f; + float4 posB = newPosB; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + b3Quat ornA = newOrnA; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + b3Quat ornB =newOrnB; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + */ + {// + int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + } + }// + }//fi(1) + } //for (int b=0 + }//if (collidables[collidableIndexB]. + else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + if (1) + { + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + // float dmin = FLT_MAX; + float4 posA = newPosA; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + b3Quat ornA = newOrnA; + float4 c0; + c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 c1; + c1 = transform(&c1local,&posB,&ornB); + // const float4 DeltaC2 = c0 - c1; + + { + int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,-1); + }//if (compoundPairIdx<maxNumCompoundPairsCapacity) + }// + }//fi (1) + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + }//for (int b=0;b<numChildrenB;b++) + return; + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) + && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int b=0;b<numChildrenB;b++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; + b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + + int shapeIndexB = collidables[childColIndexB].m_shapeIndex; + + + ////////////////////////////////////// + + if (1) + { + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + // float dmin = FLT_MAX; + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = newPosB; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0; + c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + b3Quat ornB =newOrnB; + float4 c1; + c1 = transform(&c1local,&posB,&ornB); + // const float4 DeltaC2 = c0 - c1; + {// + int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,-1,childShapeIndexB); + }//fi (compoundPairIdx<maxNumCompoundPairsCapacity) + }// + }//fi (1) + }//for (int b=0;b<numChildrenB;b++) + return; + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + return; + }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + }//i<numPairs +} + + + +__kernel void processCompoundPairsKernel( __global const b3Int4* gpuCompoundPairs, + __global const b3RigidBodyData* rigidBodies, + __global const b3Collidable* collidables, + __global const b3ConvexPolyhedronData* convexShapes, + __global const b3AlignedObjectArray<b3Float4>& vertices, + __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + __global b3Aabb* aabbs, + __global const b3GpuChildShape* gpuChildShapes, + __global b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, + __global b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, + int numCompoundPairs, + int i + ) +{ + +// int i = get_global_id(0); + if (i<numCompoundPairs) + { + int bodyIndexA = gpuCompoundPairs[i].x; + int bodyIndexB = gpuCompoundPairs[i].y; + + int childShapeIndexA = gpuCompoundPairs[i].z; + int childShapeIndexB = gpuCompoundPairs[i].w; + + int collidableIndexA = -1; + int collidableIndexB = -1; + + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + float4 posA = rigidBodies[bodyIndexA].m_pos; + + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + if (childShapeIndexA >= 0) + { + collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; + b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + posA = newPosA; + ornA = newOrnA; + } else + { + collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + } + + if (childShapeIndexB>=0) + { + collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; + b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + } else + { + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + } + + gpuHasCompoundSepNormalsOut[i] = 0; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + int shapeTypeA = collidables[collidableIndexA].m_shapeType; + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + + + if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL)) + { + return; + } + + int hasSeparatingAxis = 5; + + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + posA.w = 0.f; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal = make_float4(1,0,0,0); +// bool sepA = findSeparatingAxis( convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + bool sepA = findSeparatingAxis( convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin); + + hasSeparatingAxis = 4; + if (!sepA) + { + hasSeparatingAxis = 0; + } else + { + bool sepB = findSeparatingAxis( convexShapes[shapeIndexB],convexShapes[shapeIndexA],posB,ornB,posA,ornA,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin); + + if (!sepB) + { + hasSeparatingAxis = 0; + } else//(!sepB) + { + bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + if (sepEE) + { + gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal); + gpuHasCompoundSepNormalsOut[i] = 1; + }//sepEE + }//(!sepB) + }//(!sepA) + + + } + +} + + +__kernel void clipCompoundsHullHullKernel( __global const b3Int4* gpuCompoundPairs, + __global const b3RigidBodyData* rigidBodies, + __global const b3Collidable* collidables, + __global const b3ConvexPolyhedronData* convexShapes, + __global const b3AlignedObjectArray<b3Float4>& vertices, + __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + __global const b3GpuChildShape* gpuChildShapes, + __global const b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, + __global const b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, + __global struct b3Contact4Data* globalContactsOut, + int* nGlobalContactsOut, + int numCompoundPairs, int maxContactCapacity, int i) +{ + +// int i = get_global_id(0); + int pairIndex = i; + + float4 worldVertsB1[64]; + float4 worldVertsB2[64]; + int capacityWorldVerts = 64; + + float4 localContactsOut[64]; + int localContactCapacity=64; + + float minDist = -1e30f; + float maxDist = 0.0f; + + if (i<numCompoundPairs) + { + + if (gpuHasCompoundSepNormalsOut[i]) + { + + int bodyIndexA = gpuCompoundPairs[i].x; + int bodyIndexB = gpuCompoundPairs[i].y; + + int childShapeIndexA = gpuCompoundPairs[i].z; + int childShapeIndexB = gpuCompoundPairs[i].w; + + int collidableIndexA = -1; + int collidableIndexB = -1; + + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + float4 posA = rigidBodies[bodyIndexA].m_pos; + + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + if (childShapeIndexA >= 0) + { + collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; + b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + posA = newPosA; + ornA = newOrnA; + } else + { + collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + } + + if (childShapeIndexB>=0) + { + collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; + b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + } else + { + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + } + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i], + convexShapes[shapeIndexA], convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + worldVertsB1,worldVertsB2,capacityWorldVerts, + minDist, maxDist, + vertices,faces,indices, + vertices,faces,indices, + localContactsOut,localContactCapacity); + + if (numLocalContactsOut>0) + { + float4 normal = -gpuCompoundSepNormalsOut[i]; + int nPoints = numLocalContactsOut; + float4* pointsIn = localContactsOut; + b3Int4 contactIdx;// = {-1,-1,-1,-1}; + + contactIdx.s[0] = 0; + contactIdx.s[1] = 1; + contactIdx.s[2] = 2; + contactIdx.s[3] = 3; + + int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx); + + int dstIdx; + dstIdx = b3AtomicInc( nGlobalContactsOut); + if ((dstIdx+nReducedContacts) < maxContactCapacity) + { + __global struct b3Contact4Data* c = globalContactsOut+ dstIdx; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = gpuCompoundPairs[pairIndex].x; + int bodyB = gpuCompoundPairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_childIndexA = childShapeIndexA; + c->m_childIndexB = childShapeIndexB; + for (int i=0;i<nReducedContacts;i++) + { + c->m_worldPosB[i] = pointsIn[contactIdx.s[i]]; + } + b3Contact4Data_setNumPoints(c,nReducedContacts); + } + + }// if (numContactsOut>0) + }// if (gpuHasCompoundSepNormalsOut[i]) + }// if (i<numCompoundPairs) + +} + + +void computeContactCompoundCompound(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3GpuChildShape* cpuChildShapes, + const b3AlignedObjectArray<b3Aabb>& hostAabbsWorldSpace, + const b3AlignedObjectArray<b3Aabb>& hostAabbsLocalSpace, + + const b3AlignedObjectArray<b3Vector3>& convexVertices, + const b3AlignedObjectArray<b3Vector3>& hostUniqueEdges, + const b3AlignedObjectArray<int>& convexIndices, + const b3AlignedObjectArray<b3GpuFace>& faces, + + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity, + b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, + b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, + b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU + ) +{ + + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + b3Assert(shapeTypeB == SHAPE_COMPOUND_OF_CONVEX_HULLS); + + b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut; + int numCompoundPairsOut=0; + int maxNumCompoundPairsCapacity = 8192;//1024; + cpuCompoundPairsOut.resize(maxNumCompoundPairsCapacity); + + // work-in-progress + findCompoundPairsKernel( + pairIndex, + bodyIndexA,bodyIndexB, + collidableIndexA,collidableIndexB, + rigidBodies, + collidables, + convexShapes, + convexVertices, + hostAabbsWorldSpace, + hostAabbsLocalSpace, + cpuChildShapes, + &cpuCompoundPairsOut[0], + &numCompoundPairsOut, + maxNumCompoundPairsCapacity , + treeNodesCPU, + subTreesCPU, + bvhInfoCPU + ); + + printf("maxNumAabbChecks=%d\n",maxNumAabbChecks); + if (numCompoundPairsOut>maxNumCompoundPairsCapacity) + { + b3Error("numCompoundPairsOut exceeded maxNumCompoundPairsCapacity (%d)\n",maxNumCompoundPairsCapacity); + numCompoundPairsOut=maxNumCompoundPairsCapacity; + } + b3AlignedObjectArray<b3Float4> cpuCompoundSepNormalsOut; + b3AlignedObjectArray<int> cpuHasCompoundSepNormalsOut; + cpuCompoundSepNormalsOut.resize(numCompoundPairsOut); + cpuHasCompoundSepNormalsOut.resize(numCompoundPairsOut); + + for (int i=0;i<numCompoundPairsOut;i++) + { + + processCompoundPairsKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,0,cpuChildShapes, + cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,numCompoundPairsOut,i); + } + + for (int i=0;i<numCompoundPairsOut;i++) + { + clipCompoundsHullHullKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,cpuChildShapes, + cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,globalContactsOut,&nGlobalContactsOut,numCompoundPairsOut,maxContactCapacity,i); + } + /* + int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; + b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + + int shapeIndexA = collidables[childColIndexA].m_shapeIndex; + + + bool foundSepAxis = findSeparatingAxis(hullA,hullB, + posA, + ornA, + posB, + ornB, + + convexVertices,uniqueEdges,faces,convexIndices, + convexVertices,uniqueEdges,faces,convexIndices, + + sepNormalWorldSpace + ); + */ + + + /* + if (foundSepAxis) + { + + + contactIndex = clipHullHullSingle( + bodyIndexA, bodyIndexB, + posA,ornA, + posB,ornB, + collidableIndexA, collidableIndexB, + &rigidBodies, + &globalContactsOut, + nGlobalContactsOut, + + convexShapes, + convexShapes, + + convexVertices, + uniqueEdges, + faces, + convexIndices, + + convexVertices, + uniqueEdges, + faces, + convexIndices, + + collidables, + collidables, + sepNormalWorldSpace, + maxContactCapacity); + + } + */ + +// return contactIndex; + + /* + + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int c=0;c<numChildrenB;c++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+c; + int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex; + + float4 rootPosB = rigidBodies[bodyIndexB].m_pos; + b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat; + b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition; + b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation; + float4 posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB; + b3Quaternion ornB = b3QuatMul(rootOrnB,childOrnB);//b3QuatMul(ornB,childOrnB); + + int shapeIndexB = collidables[childColIndexB].m_shapeIndex; + + const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB]; + + } + */ + +} + +void computeContactPlaneCompound(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3GpuChildShape* cpuChildShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) +{ + + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + b3Assert(shapeTypeB == SHAPE_COMPOUND_OF_CONVEX_HULLS); + + + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int c=0;c<numChildrenB;c++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+c; + int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex; + + float4 rootPosB = rigidBodies[bodyIndexB].m_pos; + b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat; + b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition; + b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation; + float4 posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB; + b3Quaternion ornB = rootOrnB*childOrnB;//b3QuatMul(ornB,childOrnB); + + int shapeIndexB = collidables[childColIndexB].m_shapeIndex; + + const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB]; + + + b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; + b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; + + // int numContactsOut = 0; + // int numWorldVertsB1= 0; + + b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; + b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z); + b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal); + float planeConstant = planeEq.w; + b3Transform convexWorldTransform; + convexWorldTransform.setIdentity(); + convexWorldTransform.setOrigin(posB); + convexWorldTransform.setRotation(ornB); + b3Transform planeTransform; + planeTransform.setIdentity(); + planeTransform.setOrigin(posA); + planeTransform.setRotation(ornA); + + b3Transform planeInConvex; + planeInConvex= convexWorldTransform.inverse() * planeTransform; + b3Transform convexInPlane; + convexInPlane = planeTransform.inverse() * convexWorldTransform; + + b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal; + float maxDot = -1e30; + int hitVertex=-1; + b3Vector3 hitVtx; + + #define MAX_PLANE_CONVEX_POINTS 64 + + b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS]; + int numPoints = 0; + + b3Int4 contactIdx; + contactIdx.s[0] = 0; + contactIdx.s[1] = 1; + contactIdx.s[2] = 2; + contactIdx.s[3] = 3; + + for (int i=0;i<hullB->m_numVertices;i++) + { + b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i]; + float curDot = vtx.dot(planeNormalInConvex); + + + if (curDot>maxDot) + { + hitVertex=i; + maxDot=curDot; + hitVtx = vtx; + //make sure the deepest points is always included + if (numPoints==MAX_PLANE_CONVEX_POINTS) + numPoints--; + } + + if (numPoints<MAX_PLANE_CONVEX_POINTS) + { + b3Vector3 vtxWorld = convexWorldTransform*vtx; + b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld; + float dist = planeNormal.dot(vtxInPlane)-planeConstant; + if (dist<0.f) + { + vtxWorld.w = dist; + contactPoints[numPoints] = vtxWorld; + numPoints++; + } + } + + } + + int numReducedPoints = 0; + + numReducedPoints = numPoints; + + if (numPoints>4) + { + numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx); + } + int dstIdx; + // dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); + + if (numReducedPoints>0) + { + if (nGlobalContactsOut < maxContactCapacity) + { + dstIdx=nGlobalContactsOut; + nGlobalContactsOut++; + + b3Contact4* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -planeNormalWorld; + c->setFrictionCoeff(0.7); + c->setRestituitionCoeff(0.f); + + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + for (int i=0;i<numReducedPoints;i++) + { + b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]]; + c->m_worldPosB[i] = pOnB1; + } + c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints; + }//if (dstIdx < numPairs) + } + + } + + +} + + + + + +void computeContactSphereConvex(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) +{ + + float radius = collidables[collidableIndexA].m_radius; + float4 spherePos1 = rigidBodies[bodyIndexA].m_pos; + b3Quaternion sphereOrn = rigidBodies[bodyIndexA].m_quat; + + + + float4 pos = rigidBodies[bodyIndexB].m_pos; + + + b3Quaternion quat = rigidBodies[bodyIndexB].m_quat; + + b3Transform tr; + tr.setIdentity(); + tr.setOrigin(pos); + tr.setRotation(quat); + b3Transform trInv = tr.inverse(); + + float4 spherePos = trInv(spherePos1); + + int collidableIndex = rigidBodies[bodyIndexB].m_collidableIdx; + int shapeIndex = collidables[collidableIndex].m_shapeIndex; + int numFaces = convexShapes[shapeIndex].m_numFaces; + float4 closestPnt = b3MakeVector3(0, 0, 0, 0); +// float4 hitNormalWorld = b3MakeVector3(0, 0, 0, 0); + float minDist = -1000000.f; // TODO: What is the largest/smallest float? + bool bCollide = true; + int region = -1; + float4 localHitNormal; + for ( int f = 0; f < numFaces; f++ ) + { + b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f]; + float4 planeEqn; + float4 localPlaneNormal = b3MakeVector3(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); + float4 n1 = localPlaneNormal;//quatRotate(quat,localPlaneNormal); + planeEqn = n1; + planeEqn[3] = face.m_plane.w; + + float4 pntReturn; + float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn); + + if ( dist > radius) + { + bCollide = false; + break; + } + + if ( dist > 0 ) + { + //might hit an edge or vertex + b3Vector3 out; + + bool isInPoly = IsPointInPolygon(spherePos, + &face, + &convexVertices[convexShapes[shapeIndex].m_vertexOffset], + convexIndices, + &out); + if (isInPoly) + { + if (dist>minDist) + { + minDist = dist; + closestPnt = pntReturn; + localHitNormal = planeEqn; + region=1; + } + } else + { + b3Vector3 tmp = spherePos-out; + b3Scalar l2 = tmp.length2(); + if (l2<radius*radius) + { + dist = b3Sqrt(l2); + if (dist>minDist) + { + minDist = dist; + closestPnt = out; + localHitNormal = tmp/dist; + region=2; + } + + } else + { + bCollide = false; + break; + } + } + } + else + { + if ( dist > minDist ) + { + minDist = dist; + closestPnt = pntReturn; + localHitNormal = planeEqn; + region=3; + } + } + } + static int numChecks = 0; + numChecks++; + + if (bCollide && minDist > -10000) + { + + float4 normalOnSurfaceB1 = tr.getBasis()*localHitNormal;//-hitNormalWorld; + float4 pOnB1 = tr(closestPnt); + //printf("dist ,%f,",minDist); + float actualDepth = minDist-radius; + if (actualDepth<0) + { + //printf("actualDepth = ,%f,", actualDepth); + //printf("normalOnSurfaceB1 = ,%f,%f,%f,", normalOnSurfaceB1.x,normalOnSurfaceB1.y,normalOnSurfaceB1.z); + //printf("region=,%d,\n", region); + pOnB1[3] = actualDepth; + + int dstIdx; +// dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); + + if (nGlobalContactsOut < maxContactCapacity) + { + dstIdx=nGlobalContactsOut; + nGlobalContactsOut++; + + b3Contact4* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = normalOnSurfaceB1; + c->setFrictionCoeff(0.7); + c->setRestituitionCoeff(0.f); + + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + c->m_worldPosB[0] = pOnB1; + int numPoints = 1; + c->m_worldNormalOnB.w = (b3Scalar)numPoints; + }//if (dstIdx < numPairs) + } + }//if (hasCollision) + +} + + + + +int computeContactConvexConvex2( + int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, + const b3AlignedObjectArray<b3Collidable>& collidables, + const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes, + const b3AlignedObjectArray<b3Vector3>& convexVertices, + const b3AlignedObjectArray<b3Vector3>& uniqueEdges, + const b3AlignedObjectArray<int>& convexIndices, + const b3AlignedObjectArray<b3GpuFace>& faces, + b3AlignedObjectArray<b3Contact4>& globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity, + const b3AlignedObjectArray<b3Contact4>& oldContacts + ) +{ + int contactIndex = -1; + b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; + b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; + b3Vector3 posB = rigidBodies[bodyIndexB].m_pos; + b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat; + + + b3ConvexPolyhedronData hullA, hullB; + + b3Vector3 sepNormalWorldSpace; + + + + b3Collidable colA = collidables[collidableIndexA]; + hullA = convexShapes[colA.m_shapeIndex]; + //printf("numvertsA = %d\n",hullA.m_numVertices); + + + b3Collidable colB = collidables[collidableIndexB]; + hullB = convexShapes[colB.m_shapeIndex]; + //printf("numvertsB = %d\n",hullB.m_numVertices); + +// int contactCapacity = MAX_VERTS; + //int numContactsOut=0; + + +#ifdef _WIN32 + b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x)); + b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x)); +#endif + + bool foundSepAxis = findSeparatingAxis(hullA,hullB, + posA, + ornA, + posB, + ornB, + + convexVertices,uniqueEdges,faces,convexIndices, + convexVertices,uniqueEdges,faces,convexIndices, + + sepNormalWorldSpace + ); + + + if (foundSepAxis) + { + + + contactIndex = clipHullHullSingle( + bodyIndexA, bodyIndexB, + posA,ornA, + posB,ornB, + collidableIndexA, collidableIndexB, + &rigidBodies, + &globalContactsOut, + nGlobalContactsOut, + + convexShapes, + convexShapes, + + convexVertices, + uniqueEdges, + faces, + convexIndices, + + convexVertices, + uniqueEdges, + faces, + convexIndices, + + collidables, + collidables, + sepNormalWorldSpace, + maxContactCapacity); + + } + + return contactIndex; +} + + + + + + + +void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, + const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + b3OpenCLArray<b3Contact4>* contactOut, int& nContacts, + const b3OpenCLArray<b3Contact4>* oldContacts, + int maxContactCapacity, + int compoundPairCapacity, + const b3OpenCLArray<b3ConvexPolyhedronData>& convexData, + const b3OpenCLArray<b3Vector3>& gpuVertices, + const b3OpenCLArray<b3Vector3>& gpuUniqueEdges, + const b3OpenCLArray<b3GpuFace>& gpuFaces, + const b3OpenCLArray<int>& gpuIndices, + const b3OpenCLArray<b3Collidable>& gpuCollidables, + const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, + + const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, + const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, + + b3OpenCLArray<b3Vector3>& worldVertsB1GPU, + b3OpenCLArray<b3Int4>& clippingFacesOutGPU, + b3OpenCLArray<b3Vector3>& worldNormalsAGPU, + b3OpenCLArray<b3Vector3>& worldVertsA1GPU, + b3OpenCLArray<b3Vector3>& worldVertsB2GPU, + b3AlignedObjectArray<class b3OptimizedBvh*>& bvhDataUnused, + b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, + b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, + b3OpenCLArray<b3BvhInfo>* bvhInfo, + + int numObjects, + int maxTriConvexPairCapacity, + b3OpenCLArray<b3Int4>& triangleConvexPairsOut, + int& numTriConvexPairsOut + ) +{ + myframecount++; + + if (!nPairs) + return; + +#ifdef CHECK_ON_HOST + + + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + treeNodesGPU->copyToHost(treeNodesCPU); + + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + subTreesGPU->copyToHost(subTreesCPU); + + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + bvhInfo->copyToHost(bvhInfoCPU); + + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + + b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace; + clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace); + + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + + b3AlignedObjectArray<b3Vector3> hostUniqueEdges; + gpuUniqueEdges.copyToHost(hostUniqueEdges); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + + + b3AlignedObjectArray<b3Int4> hostTriangleConvexPairs; + + b3AlignedObjectArray<b3Contact4> hostContacts; + if (nContacts) + { + contactOut->copyToHost(hostContacts); + } + + b3AlignedObjectArray<b3Contact4> oldHostContacts; + + if (oldContacts->size()) + { + oldContacts->copyToHost(oldHostContacts); + } + + + hostContacts.resize(maxContactCapacity); + + for (int i=0;i<nPairs;i++) + { + int bodyIndexA = hostPairs[i].x; + int bodyIndexB = hostPairs[i].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) + { + computeContactSphereConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) + { + computeContactSphereConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); + //printf("convex-sphere\n"); + + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE) + { + computeContactPlaneConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); +// printf("convex-plane\n"); + + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) + { + computeContactPlaneConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); +// printf("plane-convex\n"); + + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + computeContactCompoundCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], hostAabbsWorldSpace,hostAabbsLocalSpace,hostVertices,hostUniqueEdges,hostIndices,hostFaces,&hostContacts[0], + nContacts,maxContactCapacity,treeNodesCPU,subTreesCPU,bvhInfoCPU); +// printf("convex-plane\n"); + + } + + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE) + { + computeContactPlaneCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], &hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); +// printf("convex-plane\n"); + + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + computeContactPlaneCompound(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], + &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); +// printf("plane-convex\n"); + + } + + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && + hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) + { + //printf("hostPairs[i].z=%d\n",hostPairs[i].z); + int contactIndex = computeContactConvexConvex2( i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); + //int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); + + + if (contactIndex>=0) + { +// printf("convex convex contactIndex = %d\n",contactIndex); + hostPairs[i].z = contactIndex; + } +// printf("plane-convex\n"); + + } + + + } + + if (hostPairs.size()) + { + pairs->copyFromHost(hostPairs); + } + + hostContacts.resize(nContacts); + if (nContacts) + { + + contactOut->copyFromHost(hostContacts); + } else + { + contactOut->resize(0); + } + + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + //printf("(HOST) nContacts = %d\n",nContacts); + +#else + + { + if (nPairs) + { + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + + B3_PROFILE("primitiveContactsKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_primitiveContactsKernel,"m_primitiveContactsKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nPairs ); + launcher.setConst(maxContactCapacity); + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); + } + } + + +#endif//CHECK_ON_HOST + + B3_PROFILE("computeConvexConvexContactsGPUSAT"); + // printf("nContacts = %d\n",nContacts); + + + m_sepNormals.resize(nPairs); + m_hasSeparatingNormals.resize(nPairs); + + int concaveCapacity=maxTriConvexPairCapacity; + m_concaveSepNormals.resize(concaveCapacity); + m_concaveHasSeparatingNormals.resize(concaveCapacity); + m_numConcavePairsOut.resize(0); + m_numConcavePairsOut.push_back(0); + + + m_gpuCompoundPairs.resize(compoundPairCapacity); + + m_gpuCompoundSepNormals.resize(compoundPairCapacity); + + + m_gpuHasCompoundSepNormals.resize(compoundPairCapacity); + + m_numCompoundPairsOut.resize(0); + m_numCompoundPairsOut.push_back(0); + + int numCompoundPairs = 0; + + int numConcavePairs =0; + + { + clFinish(m_queue); + if (findSeparatingAxisOnGpu) + { + m_dmins.resize(nPairs); + if (splitSearchSepAxisConvex) + { + + + if (useMprGpu) + { + nContacts = m_totalContactsOut.at(0); + { + B3_PROFILE("mprPenetrationKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_mprPenetrationKernel,"mprPenetrationKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + + launcher.setConst(maxContactCapacity); + launcher.setConst( nPairs ); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + /* + b3AlignedObjectArray<int>hostHasSepAxis; + m_hasSeparatingNormals.copyToHost(hostHasSepAxis); + b3AlignedObjectArray<b3Vector3>hostSepAxis; + m_sepNormals.copyToHost(hostSepAxis); + */ + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); + // printf("nContacts (after mprPenetrationKernel) = %d\n",nContacts); + if (nContacts>maxContactCapacity) + { + + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; + } + + } + } + + if (1) + { + + if (1) + { + { + B3_PROFILE("findSeparatingAxisVertexFaceKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( m_dmins.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisVertexFaceKernel,"findSeparatingAxisVertexFaceKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nPairs ); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + } + + + int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); + + { + B3_PROFILE("findSeparatingAxisEdgeEdgeKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( m_dmins.getBufferCL()), + b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true) + + }; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel,"findSeparatingAxisEdgeEdgeKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numDirections); + launcher.setConst( nPairs ); + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + + } + } + if (useMprGpu) + { + B3_PROFILE("findSeparatingAxisUnitSphereKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( m_dmins.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel,"findSeparatingAxisUnitSphereKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); + launcher.setConst( numDirections); + + launcher.setConst( nPairs ); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + } + } + + + } else + { + B3_PROFILE("findSeparatingAxisKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisKernel,"m_findSeparatingAxisKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nPairs ); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + } + + + } + else + { + + B3_PROFILE("findSeparatingAxisKernel CPU"); + + + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexShapeData; + convexData.copyToHost(hostConvexShapeData); + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + + b3AlignedObjectArray<int> hostHasSepAxis; + hostHasSepAxis.resize(nPairs); + b3AlignedObjectArray<b3Vector3> hostSepAxis; + hostSepAxis.resize(nPairs); + + b3AlignedObjectArray<b3Vector3> hostUniqueEdges; + gpuUniqueEdges.copyToHost(hostUniqueEdges); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + + b3AlignedObjectArray<b3Contact4> hostContacts; + if (nContacts) + { + contactOut->copyToHost(hostContacts); + } + hostContacts.resize(maxContactCapacity); + int nGlobalContactsOut = nContacts; + + + for (int i=0;i<nPairs;i++) + { + + int bodyIndexA = hostPairs[i].x; + int bodyIndexB = hostPairs[i].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + + int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; + + hostHasSepAxis[i] = 0; + + //once the broadphase avoids static-static pairs, we can remove this test + if ((hostBodyBuf[bodyIndexA].m_invMass==0) &&(hostBodyBuf[bodyIndexB].m_invMass==0)) + { + continue; + } + + + if ((hostCollidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(hostCollidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) + { + continue; + } + + float dmin = FLT_MAX; + + b3ConvexPolyhedronData* convexShapeA = &hostConvexShapeData[shapeIndexA]; + b3ConvexPolyhedronData* convexShapeB = &hostConvexShapeData[shapeIndexB]; + b3Vector3 posA = hostBodyBuf[bodyIndexA].m_pos; + b3Vector3 posB = hostBodyBuf[bodyIndexB].m_pos; + b3Quaternion ornA =hostBodyBuf[bodyIndexA].m_quat; + b3Quaternion ornB =hostBodyBuf[bodyIndexB].m_quat; + + + if (useGjk) + { + + //first approximate the separating axis, to 'fail-proof' GJK+EPA or MPR + { + b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter; + b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA); + b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter; + b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB); + b3Vector3 DeltaC2 = c0 - c1; + + b3Vector3 sepAxis; + + bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + + if (hasSepAxisA) + { + bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + if (hasSepAxisB) + { + bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin,false); + + if (hasEdgeEdge) + { + hostHasSepAxis[i] = 1; + hostSepAxis[i] = sepAxis; + hostSepAxis[i].w = dmin; + } + } + } + } + + if (hostHasSepAxis[i]) + { + int pairIndex = i; + + bool useMpr = true; + if (useMpr) + { + int res=0; + float depth = 0.f; + b3Vector3 sepAxis2 = b3MakeVector3(1,0,0); + b3Vector3 resultPointOnBWorld = b3MakeVector3(0,0,0); + + float depthOut; + b3Vector3 dirOut; + b3Vector3 posOut; + + + //res = b3MprPenetration(bodyIndexA,bodyIndexB,hostBodyBuf,hostConvexShapeData,hostCollidables,hostVertices,&mprConfig,&depthOut,&dirOut,&posOut); + res = b3MprPenetration(pairIndex,bodyIndexA,bodyIndexB,&hostBodyBuf[0],&hostConvexShapeData[0],&hostCollidables[0],&hostVertices[0],&hostSepAxis[0],&hostHasSepAxis[0],&depthOut,&dirOut,&posOut); + depth = depthOut; + sepAxis2 = b3MakeVector3(-dirOut.x,-dirOut.y,-dirOut.z); + resultPointOnBWorld = posOut; + //hostHasSepAxis[i] = 0; + + + if (res==0) + { + //add point? + //printf("depth = %f\n",depth); + //printf("normal = %f,%f,%f\n",dir.v[0],dir.v[1],dir.v[2]); + //qprintf("pos = %f,%f,%f\n",pos.v[0],pos.v[1],pos.v[2]); + + + + float dist=0.f; + + const b3ConvexPolyhedronData& hullA = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexA].m_collidableIdx].m_shapeIndex]; + const b3ConvexPolyhedronData& hullB = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexB].m_collidableIdx].m_shapeIndex]; + + if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist)) + { + if (depth > dist) + { + float diff = depth - dist; + + static float maxdiff = 0.f; + if (maxdiff < diff) + { + maxdiff = diff; + printf("maxdiff = %20.10f\n",maxdiff); + } + } + } + if (depth > dmin) + { + b3Vector3 oldAxis = hostSepAxis[i]; + depth = dmin; + sepAxis2 = oldAxis; + } + + + + if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist)) + { + if (depth > dist) + { + float diff = depth - dist; + //printf("?diff = %f\n",diff ); + static float maxdiff = 0.f; + if (maxdiff < diff) + { + maxdiff = diff; + printf("maxdiff = %20.10f\n",maxdiff); + } + } + //this is used for SAT + //hostHasSepAxis[i] = 1; + //hostSepAxis[i] = sepAxis2; + + //add contact point + + //int contactIndex = nGlobalContactsOut; + b3Contact4& newContact = hostContacts.at(nGlobalContactsOut); + nGlobalContactsOut++; + newContact.m_batchIdx = 0;//i; + newContact.m_bodyAPtrAndSignBit = (hostBodyBuf.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; + newContact.m_bodyBPtrAndSignBit = (hostBodyBuf.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB; + + newContact.m_frictionCoeffCmp = 45874; + newContact.m_restituitionCoeffCmp = 0; + + + static float maxDepth = 0.f; + + if (depth > maxDepth) + { + maxDepth = depth; + printf("MPR maxdepth = %f\n",maxDepth ); + + } + + + resultPointOnBWorld.w = -depth; + newContact.m_worldPosB[0] = resultPointOnBWorld; + //b3Vector3 resultPointOnAWorld = resultPointOnBWorld+depth*sepAxis2; + newContact.m_worldNormalOnB = sepAxis2; + newContact.m_worldNormalOnB.w = (b3Scalar)1; + } else + { + printf("rejected\n"); + } + + + } + } else + { + + + + //int contactIndex = computeContactConvexConvex2( i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); + b3AlignedObjectArray<b3Contact4> oldHostContacts; + int result; + result = computeContactConvexConvex2( //hostPairs, + pairIndex, + bodyIndexA, bodyIndexB, + collidableIndexA, collidableIndexB, + hostBodyBuf, + hostCollidables, + hostConvexShapeData, + hostVertices, + hostUniqueEdges, + hostIndices, + hostFaces, + hostContacts, + nGlobalContactsOut, + maxContactCapacity, + oldHostContacts + //hostHasSepAxis, + //hostSepAxis + + ); + }//mpr + }//hostHasSepAxis[i] = 1; + + } else + { + + b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter; + b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA); + b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter; + b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB); + b3Vector3 DeltaC2 = c0 - c1; + + b3Vector3 sepAxis; + + bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + + if (hasSepAxisA) + { + bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + if (hasSepAxisB) + { + bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin,true); + + if (hasEdgeEdge) + { + hostHasSepAxis[i] = 1; + hostSepAxis[i] = sepAxis; + } + } + } + } + } + + if (useGjkContacts)//nGlobalContactsOut>0) + { + //printf("nGlobalContactsOut=%d\n",nGlobalContactsOut); + nContacts = nGlobalContactsOut; + contactOut->copyFromHost(hostContacts); + + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + } + + m_hasSeparatingNormals.copyFromHost(hostHasSepAxis); + m_sepNormals.copyFromHost(hostSepAxis); + + /* + //double-check results from GPU (comment-out the 'else' so both paths are executed + b3AlignedObjectArray<int> checkHasSepAxis; + m_hasSeparatingNormals.copyToHost(checkHasSepAxis); + static int frameCount = 0; + frameCount++; + for (int i=0;i<nPairs;i++) + { + if (hostHasSepAxis[i] != checkHasSepAxis[i]) + { + printf("at frameCount %d hostHasSepAxis[%d] = %d but checkHasSepAxis[i] = %d\n", + frameCount,i,hostHasSepAxis[i],checkHasSepAxis[i]); + } + } + //m_hasSeparatingNormals.copyFromHost(hostHasSepAxis); + // m_sepNormals.copyFromHost(hostSepAxis); + */ + } + + + numCompoundPairs = m_numCompoundPairsOut.at(0); + bool useGpuFindCompoundPairs=true; + if (useGpuFindCompoundPairs) + { + B3_PROFILE("findCompoundPairsKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsLocalSpace.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL()), + b3BufferInfoCL( m_numCompoundPairsOut.getBufferCL()), + b3BufferInfoCL(subTreesGPU->getBufferCL()), + b3BufferInfoCL(treeNodesGPU->getBufferCL()), + b3BufferInfoCL(bvhInfo->getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findCompoundPairsKernel,"m_findCompoundPairsKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nPairs ); + launcher.setConst( compoundPairCapacity); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + + numCompoundPairs = m_numCompoundPairsOut.at(0); + //printf("numCompoundPairs =%d\n",numCompoundPairs ); + if (numCompoundPairs) + { + //printf("numCompoundPairs=%d\n",numCompoundPairs); + } + + + } else + { + + + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + treeNodesGPU->copyToHost(treeNodesCPU); + + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + subTreesGPU->copyToHost(subTreesCPU); + + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + bvhInfo->copyToHost(bvhInfoCPU); + + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + + b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace; + clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace); + + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + + b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut; + cpuCompoundPairsOut.resize(compoundPairCapacity); + + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + + + + + for (int pairIndex=0;pairIndex<nPairs;pairIndex++) + { + int bodyIndexA = hostPairs[pairIndex].x; + int bodyIndexB = hostPairs[pairIndex].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + if (cpuChildShapes.size()) + { + findCompoundPairsKernel( + pairIndex, + bodyIndexA, + bodyIndexB, + collidableIndexA, + collidableIndexB, + &hostBodyBuf[0], + &hostCollidables[0], + &hostConvexData[0], + hostVertices, + hostAabbsWorldSpace, + hostAabbsLocalSpace, + &cpuChildShapes[0], + &cpuCompoundPairsOut[0], + &numCompoundPairs, + compoundPairCapacity, + treeNodesCPU, + subTreesCPU, + bvhInfoCPU + ); + } + } + + + m_numCompoundPairsOut.copyFromHostPointer(&numCompoundPairs,1,0,true); + if (numCompoundPairs) + { + b3CompoundOverlappingPair* ptr = (b3CompoundOverlappingPair*)&cpuCompoundPairsOut[0]; + m_gpuCompoundPairs.copyFromHostPointer(ptr,numCompoundPairs,0,true); + } + //cpuCompoundPairsOut + + } + if (numCompoundPairs) + { + printf("numCompoundPairs=%d\n",numCompoundPairs); + } + + if (numCompoundPairs > compoundPairCapacity) + { + b3Error("Exceeded compound pair capacity (%d/%d)\n", numCompoundPairs, compoundPairCapacity); + numCompoundPairs = compoundPairCapacity; + } + + + + m_gpuCompoundPairs.resize(numCompoundPairs); + m_gpuHasCompoundSepNormals.resize(numCompoundPairs); + m_gpuCompoundSepNormals.resize(numCompoundPairs); + + + if (numCompoundPairs) + { + B3_PROFILE("processCompoundPairsPrimitivesKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_processCompoundPairsPrimitivesKernel,"m_processCompoundPairsPrimitivesKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numCompoundPairs ); + launcher.setConst(maxContactCapacity); + + int num = numCompoundPairs; + launcher.launch1D( num); + clFinish(m_queue); + nContacts = m_totalContactsOut.at(0); + //printf("nContacts (after processCompoundPairsPrimitivesKernel) = %d\n",nContacts); + if (nContacts>maxContactCapacity) + { + + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; + } + } + + + if (numCompoundPairs) + { + B3_PROFILE("processCompoundPairsKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL()), + b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_processCompoundPairsKernel,"m_processCompoundPairsKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numCompoundPairs ); + + int num = numCompoundPairs; + launcher.launch1D( num); + clFinish(m_queue); + + } + + + //printf("numConcave = %d\n",numConcave); + + + +// printf("hostNormals.size()=%d\n",hostNormals.size()); + //int numPairs = pairCount.at(0); + + + + } + int vertexFaceCapacity = 64; + + + + { + //now perform the tree query on GPU + + + + + if (treeNodesGPU->size() && treeNodesGPU->size()) + { + if (bvhTraversalKernelGPU) + { + + B3_PROFILE("m_bvhTraversalKernel"); + + + numConcavePairs = m_numConcavePairsOut.at(0); + + b3LauncherCL launcher(m_queue, m_bvhTraversalKernel,"m_bvhTraversalKernel"); + launcher.setBuffer( pairs->getBufferCL()); + launcher.setBuffer( bodyBuf->getBufferCL()); + launcher.setBuffer( gpuCollidables.getBufferCL()); + launcher.setBuffer( clAabbsWorldSpace.getBufferCL()); + launcher.setBuffer( triangleConvexPairsOut.getBufferCL()); + launcher.setBuffer( m_numConcavePairsOut.getBufferCL()); + launcher.setBuffer( subTreesGPU->getBufferCL()); + launcher.setBuffer( treeNodesGPU->getBufferCL()); + launcher.setBuffer( bvhInfo->getBufferCL()); + + launcher.setConst( nPairs ); + launcher.setConst( maxTriConvexPairCapacity); + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + numConcavePairs = m_numConcavePairsOut.at(0); + } else + { + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + + //int maxTriConvexPairCapacity, + b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; + triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); + + //int numTriConvexPairsOutHost=0; + numConcavePairs = 0; + //m_numConcavePairsOut + + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + treeNodesGPU->copyToHost(treeNodesCPU); + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + subTreesGPU->copyToHost(subTreesCPU); + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + bvhInfo->copyToHost(bvhInfoCPU); + //compute it... + + volatile int hostNumConcavePairsOut=0; + + // + for (int i=0;i<nPairs;i++) + { + b3BvhTraversal( &hostPairs.at(0), + &hostBodyBuf.at(0), + &hostCollidables.at(0), + &hostAabbsWorldSpace.at(0), + &triangleConvexPairsOutHost.at(0), + &hostNumConcavePairsOut, + &subTreesCPU.at(0), + &treeNodesCPU.at(0), + &bvhInfoCPU.at(0), + nPairs, + maxTriConvexPairCapacity, + i); + } + numConcavePairs = hostNumConcavePairsOut; + + if (hostNumConcavePairsOut) + { + triangleConvexPairsOutHost.resize(hostNumConcavePairsOut); + triangleConvexPairsOut.copyFromHost(triangleConvexPairsOutHost); + } + // + + m_numConcavePairsOut.resize(0); + m_numConcavePairsOut.push_back(numConcavePairs); + } + + //printf("numConcavePairs=%d (max = %d\n",numConcavePairs,maxTriConvexPairCapacity); + + if (numConcavePairs > maxTriConvexPairCapacity) + { + static int exceeded_maxTriConvexPairCapacity_count = 0; + b3Error("Exceeded the maxTriConvexPairCapacity (found %d but max is %d, it happened %d times)\n", + numConcavePairs,maxTriConvexPairCapacity,exceeded_maxTriConvexPairCapacity_count++); + numConcavePairs = maxTriConvexPairCapacity; + } + triangleConvexPairsOut.resize(numConcavePairs); + + if (numConcavePairs) + { + + + + + clippingFacesOutGPU.resize(numConcavePairs); + worldNormalsAGPU.resize(numConcavePairs); + worldVertsA1GPU.resize(vertexFaceCapacity*(numConcavePairs)); + worldVertsB1GPU.resize(vertexFaceCapacity*(numConcavePairs)); + + + if (findConcaveSeparatingAxisKernelGPU) + { + + /* + m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU); + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsA1GPU.copyFromHost(worldVertsA1CPU); + worldNormalsAGPU.copyFromHost(worldNormalsACPU); + worldVertsB1GPU.copyFromHost(worldVertsB1CPU); + */ + + //now perform a SAT test for each triangle-convex element (stored in triangleConvexPairsOut) + if (splitSearchSepAxisConcave) + { + //printf("numConcavePairs = %d\n",numConcavePairs); + m_dmins.resize(numConcavePairs); + { + B3_PROFILE("findConcaveSeparatingAxisVertexFaceKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisVertexFaceKernel,"m_findConcaveSeparatingAxisVertexFaceKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + launcher.setConst( numConcavePairs ); + + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + + + } +// numConcavePairs = 0; + if (1) + { + B3_PROFILE("findConcaveSeparatingAxisEdgeEdgeKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisEdgeEdgeKernel,"m_findConcaveSeparatingAxisEdgeEdgeKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + launcher.setConst( numConcavePairs ); + + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + } + + + // numConcavePairs = 0; + + + + + + + } else + { + B3_PROFILE("findConcaveSeparatingAxisKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisKernel,"m_findConcaveSeparatingAxisKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + launcher.setConst( numConcavePairs ); + + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + } + + + } else + { + + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + + b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; + triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost); + //triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + + b3AlignedObjectArray<b3Vector3> hostUniqueEdges; + gpuUniqueEdges.copyToHost(hostUniqueEdges); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + + + + b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; + m_concaveSepNormals.copyToHost(concaveSepNormalsHost); + concaveHasSeparatingNormalsCPU.resize(concaveSepNormalsHost.size()); + + b3GpuChildShape* childShapePointerCPU = 0; + if (cpuChildShapes.size()) + childShapePointerCPU = &cpuChildShapes.at(0); + + clippingFacesOutCPU.resize(clippingFacesOutGPU.size()); + worldVertsA1CPU.resize(worldVertsA1GPU.size()); + worldNormalsACPU.resize(worldNormalsAGPU.size()); + worldVertsB1CPU.resize(worldVertsB1GPU.size()); + + for (int i=0;i<numConcavePairs;i++) + { + b3FindConcaveSeparatingAxisKernel(&triangleConvexPairsOutHost.at(0), + &hostBodyBuf.at(0), + &hostCollidables.at(0), + &hostConvexData.at(0), &hostVertices.at(0),&hostUniqueEdges.at(0), + &hostFaces.at(0),&hostIndices.at(0),childShapePointerCPU, + &hostAabbsWorldSpace.at(0), + &concaveSepNormalsHost.at(0), + &clippingFacesOutCPU.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + vertexFaceCapacity, + numConcavePairs,i); + }; + + m_concaveSepNormals.copyFromHost(concaveSepNormalsHost); + m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU); + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsA1GPU.copyFromHost(worldVertsA1CPU); + worldNormalsAGPU.copyFromHost(worldNormalsACPU); + worldVertsB1GPU.copyFromHost(worldVertsB1CPU); + + + + } +// b3AlignedObjectArray<b3Vector3> cpuCompoundSepNormals; +// m_concaveSepNormals.copyToHost(cpuCompoundSepNormals); +// b3AlignedObjectArray<b3Int4> cpuConcavePairs; +// triangleConvexPairsOut.copyToHost(cpuConcavePairs); + + + } + } + + + } + + if (numConcavePairs) + { + if (numConcavePairs) + { + B3_PROFILE("findConcaveSphereContactsKernel"); + nContacts = m_totalContactsOut.at(0); +// printf("nContacts1 = %d\n",nContacts); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findConcaveSphereContactsKernel,"m_findConcaveSphereContactsKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + + launcher.setConst( numConcavePairs ); + launcher.setConst(maxContactCapacity); + + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + nContacts = m_totalContactsOut.at(0); + //printf("nContacts (after findConcaveSphereContactsKernel) = %d\n",nContacts); + + //printf("nContacts2 = %d\n",nContacts); + + if (nContacts >= maxContactCapacity) + { + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; + } + } + + } + + + +#ifdef __APPLE__ + bool contactClippingOnGpu = true; +#else + bool contactClippingOnGpu = true; +#endif + + if (contactClippingOnGpu) + { + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); +// printf("nContacts3 = %d\n",nContacts); + + + //B3_PROFILE("clipHullHullKernel"); + + bool breakupConcaveConvexKernel = true; + +#ifdef __APPLE__ + //actually, some Apple OpenCL platform/device combinations work fine... + breakupConcaveConvexKernel = true; +#endif + //concave-convex contact clipping + if (numConcavePairs) + { + // printf("numConcavePairs = %d\n", numConcavePairs); + // nContacts = m_totalContactsOut.at(0); + // printf("nContacts before = %d\n", nContacts); + + if (breakupConcaveConvexKernel) + { + + worldVertsB2GPU.resize(vertexFaceCapacity*numConcavePairs); + + + //clipFacesAndFindContacts + + if (clipConcaveFacesAndFindContactsCPU) + { + + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + + clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); + worldVertsA1GPU.copyToHost(worldVertsA1CPU); + worldNormalsAGPU.copyToHost(worldNormalsACPU); + worldVertsB1GPU.copyToHost(worldVertsB1CPU); + + + + b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU); + + b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; + m_concaveSepNormals.copyToHost(concaveSepNormalsHost); + + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + worldVertsB2CPU.resize(worldVertsB2GPU.size()); + + + for (int i=0;i<numConcavePairs;i++) + { + + clipFacesAndFindContactsKernel( &concaveSepNormalsHost.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + &clippingFacesOutCPU.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &worldVertsB2CPU.at(0), + vertexFaceCapacity, + i); + } + + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsB2GPU.copyFromHost(worldVertsB2CPU); + + + } else + { + + if (1) + { + + + + B3_PROFILE("clipFacesAndFindContacts"); + //nContacts = m_totalContactsOut.at(0); + //int h = m_hasSeparatingNormals.at(0); + //int4 p = clippingFacesOutGPU.at(0); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL( worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL( worldVertsB2GPU.getBufferCL()) + }; + b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + + launcher.setConst( numConcavePairs ); + int debugMode = 0; + launcher.setConst( debugMode); + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + //int bla = m_totalContactsOut.at(0); + } + } + //contactReduction + { + int newContactCapacity=nContacts+numConcavePairs; + contactOut->reserve(newContactCapacity); + if (reduceConcaveContactsOnGPU) + { +// printf("newReservation = %d\n",newReservation); + { + B3_PROFILE("newContactReductionKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsB2GPU.getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(newContactCapacity); + launcher.setConst( numConcavePairs ); + int num = numConcavePairs; + + launcher.launch1D( num); + } + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); + + //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); + }else + { + + volatile int nGlobalContactsOut = nContacts; + b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; + triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU); + + b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; + m_concaveSepNormals.copyToHost(concaveSepNormalsHost); + + + b3AlignedObjectArray<b3Contact4> hostContacts; + if (nContacts) + { + contactOut->copyToHost(hostContacts); + } + hostContacts.resize(newContactCapacity); + + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + + clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); + worldVertsB2GPU.copyToHost(worldVertsB2CPU); + + + + for (int i=0;i<numConcavePairs;i++) + { + b3NewContactReductionKernel( &triangleConvexPairsOutHost.at(0), + &hostBodyBuf.at(0), + &concaveSepNormalsHost.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + &hostContacts.at(0), + &clippingFacesOutCPU.at(0), + &worldVertsB2CPU.at(0), + &nGlobalContactsOut, + vertexFaceCapacity, + newContactCapacity, + numConcavePairs, + i + ); + + } + + + nContacts = nGlobalContactsOut; + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); +// nContacts = m_totalContactsOut.at(0); + //contactOut->resize(nContacts); + hostContacts.resize(nContacts); + //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); + contactOut->copyFromHost(hostContacts); + } + + } + //re-use? + + + } else + { + B3_PROFILE("clipHullHullConcaveConvexKernel"); + nContacts = m_totalContactsOut.at(0); + int newContactCapacity = contactOut->capacity(); + + //printf("contactOut5 = %d\n",nContacts); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + b3LauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel,"m_clipHullHullConcaveConvexKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(newContactCapacity); + launcher.setConst( numConcavePairs ); + int num = numConcavePairs; + launcher.launch1D( num); + clFinish(m_queue); + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); + //printf("contactOut6 = %d\n",nContacts); + b3AlignedObjectArray<b3Contact4> cpuContacts; + contactOut->copyToHost(cpuContacts); + } + // printf("nContacts after = %d\n", nContacts); + }//numConcavePairs + + + + //convex-convex contact clipping + + bool breakupKernel = false; + +#ifdef __APPLE__ + breakupKernel = true; +#endif + +#ifdef CHECK_ON_HOST + bool computeConvexConvex = false; +#else + bool computeConvexConvex = true; +#endif//CHECK_ON_HOST + if (computeConvexConvex) + { + B3_PROFILE("clipHullHullKernel"); + if (breakupKernel) + { + + + + + worldVertsB1GPU.resize(vertexFaceCapacity*nPairs); + clippingFacesOutGPU.resize(nPairs); + worldNormalsAGPU.resize(nPairs); + worldVertsA1GPU.resize(vertexFaceCapacity*nPairs); + worldVertsB2GPU.resize(vertexFaceCapacity*nPairs); + + if (findConvexClippingFacesGPU) + { + B3_PROFILE("findClippingFacesKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL( worldVertsB1GPU.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_findClippingFacesKernel,"m_findClippingFacesKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( vertexFaceCapacity); + launcher.setConst( nPairs ); + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + + } else + { + + float minDist = -1e30f; + float maxDist = 0.02f; + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + + b3AlignedObjectArray<int> hostHasSepNormals; + m_hasSeparatingNormals.copyToHost(hostHasSepNormals); + b3AlignedObjectArray<b3Vector3> cpuSepNormals; + m_sepNormals.copyToHost(cpuSepNormals); + + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + + //worldVertsB1GPU.resize(vertexFaceCapacity*nPairs); + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + worldVertsB1GPU.copyToHost(worldVertsB1CPU); + + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); + + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + worldNormalsACPU.resize(nPairs); + + b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + worldVertsA1CPU.resize(worldVertsA1GPU.size()); + + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + + + for (int i=0;i<nPairs;i++) + { + + int bodyIndexA = hostPairs[i].x; + int bodyIndexB = hostPairs[i].y; + + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + + int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; + + + if (hostHasSepNormals[i]) + { + b3FindClippingFaces(cpuSepNormals[i], + &hostConvexData[shapeIndexA], + &hostConvexData[shapeIndexB], + hostBodyBuf[bodyIndexA].m_pos,hostBodyBuf[bodyIndexA].m_quat, + hostBodyBuf[bodyIndexB].m_pos,hostBodyBuf[bodyIndexB].m_quat, + &worldVertsA1CPU.at(0),&worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + vertexFaceCapacity,minDist,maxDist, + &hostVertices.at(0),&hostFaces.at(0), + &hostIndices.at(0), + &hostVertices.at(0),&hostFaces.at(0), + &hostIndices.at(0),&clippingFacesOutCPU.at(0),i); + } + } + + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsA1GPU.copyFromHost(worldVertsA1CPU); + worldNormalsAGPU.copyFromHost(worldNormalsACPU); + worldVertsB1GPU.copyFromHost(worldVertsB1CPU); + + } + + + + + + ///clip face B against face A, reduce contacts and append them to a global contact array + if (1) + { + if (clipConvexFacesAndFindContactsCPU) + { + + //b3AlignedObjectArray<b3Int4> hostPairs; + //pairs->copyToHost(hostPairs); + + b3AlignedObjectArray<b3Vector3> hostSepNormals; + m_sepNormals.copyToHost(hostSepNormals); + b3AlignedObjectArray<int> hostHasSepAxis; + m_hasSeparatingNormals.copyToHost(hostHasSepAxis); + + b3AlignedObjectArray<b3Int4> hostClippingFaces; + clippingFacesOutGPU.copyToHost(hostClippingFaces); + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + worldVertsB2CPU.resize(vertexFaceCapacity*nPairs); + + b3AlignedObjectArray<b3Vector3>worldVertsA1CPU; + worldVertsA1GPU.copyToHost(worldVertsA1CPU); + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + worldNormalsAGPU.copyToHost(worldNormalsACPU); + + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + worldVertsB1GPU.copyToHost(worldVertsB1CPU); + + /* + __global const b3Float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global b3Int4* clippingFacesOut, + __global b3Float4* worldVertsA1, + __global b3Float4* worldNormalsA1, + __global b3Float4* worldVertsB1, + __global b3Float4* worldVertsB2, + int vertexFaceCapacity, + int pairIndex + */ + for (int i=0;i<nPairs;i++) + { + clipFacesAndFindContactsKernel( + &hostSepNormals.at(0), + &hostHasSepAxis.at(0), + &hostClippingFaces.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &worldVertsB2CPU.at(0), + + vertexFaceCapacity, + i); + } + + clippingFacesOutGPU.copyFromHost(hostClippingFaces); + worldVertsB2GPU.copyFromHost(worldVertsB2CPU); + + } else + { + B3_PROFILE("clipFacesAndFindContacts"); + //nContacts = m_totalContactsOut.at(0); + //int h = m_hasSeparatingNormals.at(0); + //int4 p = clippingFacesOutGPU.at(0); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL( worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL( worldVertsB2GPU.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + + launcher.setConst( nPairs ); + int debugMode = 0; + launcher.setConst( debugMode); + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + } + + { + nContacts = m_totalContactsOut.at(0); + //printf("nContacts = %d\n",nContacts); + + int newContactCapacity = nContacts+nPairs; + contactOut->reserve(newContactCapacity); + + if (reduceConvexContactsOnGPU) + { + { + B3_PROFILE("newContactReductionKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL( worldVertsB2GPU.getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + + b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(newContactCapacity); + launcher.setConst( nPairs ); + int num = nPairs; + + launcher.launch1D( num); + } + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); + } else + { + + volatile int nGlobalContactsOut = nContacts; + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Vector3> hostSepNormals; + m_sepNormals.copyToHost(hostSepNormals); + b3AlignedObjectArray<int> hostHasSepAxis; + m_hasSeparatingNormals.copyToHost(hostHasSepAxis); + b3AlignedObjectArray<b3Contact4> hostContactsOut; + contactOut->copyToHost(hostContactsOut); + hostContactsOut.resize(newContactCapacity); + + b3AlignedObjectArray<b3Int4> hostClippingFaces; + clippingFacesOutGPU.copyToHost(hostClippingFaces); + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + worldVertsB2GPU.copyToHost(worldVertsB2CPU); + + for (int i=0;i<nPairs;i++) + { + b3NewContactReductionKernel(&hostPairs.at(0), + &hostBodyBuf.at(0), + &hostSepNormals.at(0), + &hostHasSepAxis.at(0), + &hostContactsOut.at(0), + &hostClippingFaces.at(0), + &worldVertsB2CPU.at(0), + &nGlobalContactsOut, + vertexFaceCapacity, + newContactCapacity, + nPairs, + i); + } + + nContacts = nGlobalContactsOut; + m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + hostContactsOut.resize(nContacts); + //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); + contactOut->copyFromHost(hostContactsOut); + } + // b3Contact4 pt = contactOut->at(0); + // printf("nContacts = %d\n",nContacts); + } + } + } + else//breakupKernel + { + + if (nPairs) + { + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( pairs->getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( m_sepNormals.getBufferCL()), + b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + b3LauncherCL launcher(m_queue, m_clipHullHullKernel,"m_clipHullHullKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nPairs ); + launcher.setConst(maxContactCapacity); + + int num = nPairs; + launcher.launch1D( num); + clFinish(m_queue); + + nContacts = m_totalContactsOut.at(0); + if (nContacts >= maxContactCapacity) + { + b3Error("Exceeded contact capacity (%d/%d)\n",nContacts,maxContactCapacity); + nContacts = maxContactCapacity; + } + contactOut->resize(nContacts); + } + } + + + int nCompoundsPairs = m_gpuCompoundPairs.size(); + + if (nCompoundsPairs) + { + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), + b3BufferInfoCL( bodyBuf->getBufferCL(),true), + b3BufferInfoCL( gpuCollidables.getBufferCL(),true), + b3BufferInfoCL( convexData.getBufferCL(),true), + b3BufferInfoCL( gpuVertices.getBufferCL(),true), + b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), + b3BufferInfoCL( gpuFaces.getBufferCL(),true), + b3BufferInfoCL( gpuIndices.getBufferCL(),true), + b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), + b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL(),true), + b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL(),true), + b3BufferInfoCL( contactOut->getBufferCL()), + b3BufferInfoCL( m_totalContactsOut.getBufferCL()) + }; + b3LauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel,"m_clipCompoundsHullHullKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nCompoundsPairs ); + launcher.setConst(maxContactCapacity); + + int num = nCompoundsPairs; + launcher.launch1D( num); + clFinish(m_queue); + + nContacts = m_totalContactsOut.at(0); + if (nContacts>maxContactCapacity) + { + + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; + } + contactOut->resize(nContacts); + }//if nCompoundsPairs + } + }//contactClippingOnGpu + + //printf("nContacts end = %d\n",nContacts); + + //printf("frameCount = %d\n",frameCount++); +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h new file mode 100644 index 0000000000..e24c1579c6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h @@ -0,0 +1,118 @@ + +#ifndef _CONVEX_HULL_CONTACT_H +#define _CONVEX_HULL_CONTACT_H + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "b3OptimizedBvh.h" +#include "b3BvhInfo.h" +#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" + +//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h" + + + + +struct GpuSatCollision +{ + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_findSeparatingAxisKernel; + cl_kernel m_mprPenetrationKernel; + cl_kernel m_findSeparatingAxisUnitSphereKernel; + + + cl_kernel m_findSeparatingAxisVertexFaceKernel; + cl_kernel m_findSeparatingAxisEdgeEdgeKernel; + + cl_kernel m_findConcaveSeparatingAxisKernel; + cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel; + cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel; + + + + + cl_kernel m_findCompoundPairsKernel; + cl_kernel m_processCompoundPairsKernel; + + cl_kernel m_clipHullHullKernel; + cl_kernel m_clipCompoundsHullHullKernel; + + cl_kernel m_clipFacesAndFindContacts; + cl_kernel m_findClippingFacesKernel; + + cl_kernel m_clipHullHullConcaveConvexKernel; +// cl_kernel m_extractManifoldAndAddContactKernel; + cl_kernel m_newContactReductionKernel; + + cl_kernel m_bvhTraversalKernel; + cl_kernel m_primitiveContactsKernel; + cl_kernel m_findConcaveSphereContactsKernel; + + cl_kernel m_processCompoundPairsPrimitivesKernel; + + b3OpenCLArray<b3Vector3> m_unitSphereDirections; + + b3OpenCLArray<int> m_totalContactsOut; + + b3OpenCLArray<b3Vector3> m_sepNormals; + b3OpenCLArray<float> m_dmins; + + b3OpenCLArray<int> m_hasSeparatingNormals; + b3OpenCLArray<b3Vector3> m_concaveSepNormals; + b3OpenCLArray<int> m_concaveHasSeparatingNormals; + b3OpenCLArray<int> m_numConcavePairsOut; + b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs; + b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals; + b3OpenCLArray<int> m_gpuHasCompoundSepNormals; + b3OpenCLArray<int> m_numCompoundPairsOut; + + + GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q ); + virtual ~GpuSatCollision(); + + + void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, + const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + b3OpenCLArray<b3Contact4>* contactOut, int& nContacts, + const b3OpenCLArray<b3Contact4>* oldContacts, + int maxContactCapacity, + int compoundPairCapacity, + const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData, + const b3OpenCLArray<b3Vector3>& vertices, + const b3OpenCLArray<b3Vector3>& uniqueEdges, + const b3OpenCLArray<b3GpuFace>& faces, + const b3OpenCLArray<int>& indices, + const b3OpenCLArray<b3Collidable>& gpuCollidables, + const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, + + const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, + const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, + + b3OpenCLArray<b3Vector3>& worldVertsB1GPU, + b3OpenCLArray<b3Int4>& clippingFacesOutGPU, + b3OpenCLArray<b3Vector3>& worldNormalsAGPU, + b3OpenCLArray<b3Vector3>& worldVertsA1GPU, + b3OpenCLArray<b3Vector3>& worldVertsB2GPU, + b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData, + b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, + b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, + b3OpenCLArray<b3BvhInfo>* bvhInfo, + int numObjects, + int maxTriConvexPairCapacity, + b3OpenCLArray<b3Int4>& triangleConvexPairs, + int& numTriConvexPairsOut + ); + + +}; + +#endif //_CONVEX_HULL_CONTACT_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h new file mode 100644 index 0000000000..337100fb1a --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h @@ -0,0 +1,9 @@ +#ifndef CONVEX_POLYHEDRON_CL +#define CONVEX_POLYHEDRON_CL + +#include "Bullet3Common/b3Transform.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" + + + +#endif //CONVEX_POLYHEDRON_CL diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp new file mode 100644 index 0000000000..d636f983c6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp @@ -0,0 +1,1014 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the +use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not +claim that you wrote the original software. If you use this software in a +product, an acknowledgment in the product documentation would be appreciated +but is not required. +2. Altered source versions must be plainly marked as such, and must not be +misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +/* +GJK-EPA collision solver by Nathanael Presson, 2008 +*/ + +#include "b3GjkEpa.h" + +#include "b3SupportMappings.h" + +namespace gjkepa2_impl2 +{ + + // Config + + /* GJK */ +#define GJK_MAX_ITERATIONS 128 +#define GJK_ACCURACY ((b3Scalar)0.0001) +#define GJK_MIN_DISTANCE ((b3Scalar)0.0001) +#define GJK_DUPLICATED_EPS ((b3Scalar)0.0001) +#define GJK_SIMPLEX2_EPS ((b3Scalar)0.0) +#define GJK_SIMPLEX3_EPS ((b3Scalar)0.0) +#define GJK_SIMPLEX4_EPS ((b3Scalar)0.0) + + /* EPA */ +#define EPA_MAX_VERTICES 64 +#define EPA_MAX_FACES (EPA_MAX_VERTICES*2) +#define EPA_MAX_ITERATIONS 255 +#define EPA_ACCURACY ((b3Scalar)0.0001) +#define EPA_FALLBACK (10*EPA_ACCURACY) +#define EPA_PLANE_EPS ((b3Scalar)0.00001) +#define EPA_INSIDE_EPS ((b3Scalar)0.01) + + + // Shorthands + + + // MinkowskiDiff + struct b3MinkowskiDiff + { + + + const b3ConvexPolyhedronData* m_shapes[2]; + + + b3Matrix3x3 m_toshape1; + b3Transform m_toshape0; + + bool m_enableMargin; + + + void EnableMargin(bool enable) + { + m_enableMargin = enable; + } + inline b3Vector3 Support0(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA) const + { + if (m_enableMargin) + { + return localGetSupportVertexWithMargin(d,m_shapes[0],verticesA,0.f); + } else + { + return localGetSupportVertexWithoutMargin(d,m_shapes[0],verticesA); + } + } + inline b3Vector3 Support1(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + if (m_enableMargin) + { + return m_toshape0*(localGetSupportVertexWithMargin(m_toshape1*d,m_shapes[1],verticesB,0.f)); + } else + { + return m_toshape0*(localGetSupportVertexWithoutMargin(m_toshape1*d,m_shapes[1],verticesB)); + } + } + + inline b3Vector3 Support(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + return(Support0(d,verticesA)-Support1(-d,verticesB)); + } + b3Vector3 Support(const b3Vector3& d,unsigned int index,const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + if(index) + return(Support1(d,verticesA)); + else + return(Support0(d,verticesB)); + } + }; + + typedef b3MinkowskiDiff tShape; + + + // GJK + struct b3GJK + { + /* Types */ + struct sSV + { + b3Vector3 d,w; + }; + struct sSimplex + { + sSV* c[4]; + b3Scalar p[4]; + unsigned int rank; + }; + struct eStatus { enum _ { + Valid, + Inside, + Failed };}; + /* Fields */ + tShape m_shape; + const b3AlignedObjectArray<b3Vector3>& m_verticesA; + const b3AlignedObjectArray<b3Vector3>& m_verticesB; + b3Vector3 m_ray; + b3Scalar m_distance; + sSimplex m_simplices[2]; + sSV m_store[4]; + sSV* m_free[4]; + unsigned int m_nfree; + unsigned int m_current; + sSimplex* m_simplex; + eStatus::_ m_status; + /* Methods */ + b3GJK(const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB) + :m_verticesA(verticesA),m_verticesB(verticesB) + { + Initialize(); + } + void Initialize() + { + m_ray = b3MakeVector3(0,0,0); + m_nfree = 0; + m_status = eStatus::Failed; + m_current = 0; + m_distance = 0; + } + eStatus::_ Evaluate(const tShape& shapearg,const b3Vector3& guess) + { + unsigned int iterations=0; + b3Scalar sqdist=0; + b3Scalar alpha=0; + b3Vector3 lastw[4]; + unsigned int clastw=0; + /* Initialize solver */ + m_free[0] = &m_store[0]; + m_free[1] = &m_store[1]; + m_free[2] = &m_store[2]; + m_free[3] = &m_store[3]; + m_nfree = 4; + m_current = 0; + m_status = eStatus::Valid; + m_shape = shapearg; + m_distance = 0; + /* Initialize simplex */ + m_simplices[0].rank = 0; + m_ray = guess; + const b3Scalar sqrl= m_ray.length2(); + appendvertice(m_simplices[0],sqrl>0?-m_ray:b3MakeVector3(1,0,0)); + m_simplices[0].p[0] = 1; + m_ray = m_simplices[0].c[0]->w; + sqdist = sqrl; + lastw[0] = + lastw[1] = + lastw[2] = + lastw[3] = m_ray; + /* Loop */ + do { + const unsigned int next=1-m_current; + sSimplex& cs=m_simplices[m_current]; + sSimplex& ns=m_simplices[next]; + /* Check zero */ + const b3Scalar rl=m_ray.length(); + if(rl<GJK_MIN_DISTANCE) + {/* Touching or inside */ + m_status=eStatus::Inside; + break; + } + /* Append new vertice in -'v' direction */ + appendvertice(cs,-m_ray); + const b3Vector3& w=cs.c[cs.rank-1]->w; + bool found=false; + for(unsigned int i=0;i<4;++i) + { + if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS) + { found=true;break; } + } + if(found) + {/* Return old simplex */ + removevertice(m_simplices[m_current]); + break; + } + else + {/* Update lastw */ + lastw[clastw=(clastw+1)&3]=w; + } + /* Check for termination */ + const b3Scalar omega=b3Dot(m_ray,w)/rl; + alpha=b3Max(omega,alpha); + if(((rl-alpha)-(GJK_ACCURACY*rl))<=0) + {/* Return old simplex */ + removevertice(m_simplices[m_current]); + break; + } + /* Reduce simplex */ + b3Scalar weights[4]; + unsigned int mask=0; + switch(cs.rank) + { + case 2: sqdist=projectorigin( cs.c[0]->w, + cs.c[1]->w, + weights,mask);break; + case 3: sqdist=projectorigin( cs.c[0]->w, + cs.c[1]->w, + cs.c[2]->w, + weights,mask);break; + case 4: sqdist=projectorigin( cs.c[0]->w, + cs.c[1]->w, + cs.c[2]->w, + cs.c[3]->w, + weights,mask);break; + } + if(sqdist>=0) + {/* Valid */ + ns.rank = 0; + m_ray = b3MakeVector3(0,0,0); + m_current = next; + for(unsigned int i=0,ni=cs.rank;i<ni;++i) + { + if(mask&(1<<i)) + { + ns.c[ns.rank] = cs.c[i]; + ns.p[ns.rank++] = weights[i]; + m_ray += cs.c[i]->w*weights[i]; + } + else + { + m_free[m_nfree++] = cs.c[i]; + } + } + if(mask==15) m_status=eStatus::Inside; + } + else + {/* Return old simplex */ + removevertice(m_simplices[m_current]); + break; + } + m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eStatus::Failed; + } while(m_status==eStatus::Valid); + m_simplex=&m_simplices[m_current]; + switch(m_status) + { + case eStatus::Valid: m_distance=m_ray.length();break; + case eStatus::Inside: m_distance=0;break; + default: + { + } + } + return(m_status); + } + bool EncloseOrigin() + { + switch(m_simplex->rank) + { + case 1: + { + for(unsigned int i=0;i<3;++i) + { + b3Vector3 axis=b3MakeVector3(0,0,0); + axis[i]=1; + appendvertice(*m_simplex, axis); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + appendvertice(*m_simplex,-axis); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + } + } + break; + case 2: + { + const b3Vector3 d=m_simplex->c[1]->w-m_simplex->c[0]->w; + for(unsigned int i=0;i<3;++i) + { + b3Vector3 axis=b3MakeVector3(0,0,0); + axis[i]=1; + const b3Vector3 p=b3Cross(d,axis); + if(p.length2()>0) + { + appendvertice(*m_simplex, p); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + appendvertice(*m_simplex,-p); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + } + } + } + break; + case 3: + { + const b3Vector3 n=b3Cross(m_simplex->c[1]->w-m_simplex->c[0]->w, + m_simplex->c[2]->w-m_simplex->c[0]->w); + if(n.length2()>0) + { + appendvertice(*m_simplex,n); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + appendvertice(*m_simplex,-n); + if(EncloseOrigin()) return(true); + removevertice(*m_simplex); + } + } + break; + case 4: + { + if(b3Fabs(det( m_simplex->c[0]->w-m_simplex->c[3]->w, + m_simplex->c[1]->w-m_simplex->c[3]->w, + m_simplex->c[2]->w-m_simplex->c[3]->w))>0) + return(true); + } + break; + } + return(false); + } + /* Internals */ + void getsupport(const b3Vector3& d,sSV& sv) const + { + sv.d = d/d.length(); + sv.w = m_shape.Support(sv.d,m_verticesA,m_verticesB); + } + void removevertice(sSimplex& simplex) + { + m_free[m_nfree++]=simplex.c[--simplex.rank]; + } + void appendvertice(sSimplex& simplex,const b3Vector3& v) + { + simplex.p[simplex.rank]=0; + simplex.c[simplex.rank]=m_free[--m_nfree]; + getsupport(v,*simplex.c[simplex.rank++]); + } + static b3Scalar det(const b3Vector3& a,const b3Vector3& b,const b3Vector3& c) + { + return( a.y*b.z*c.x+a.z*b.x*c.y- + a.x*b.z*c.y-a.y*b.x*c.z+ + a.x*b.y*c.z-a.z*b.y*c.x); + } + static b3Scalar projectorigin( const b3Vector3& a, + const b3Vector3& b, + b3Scalar* w,unsigned int& m) + { + const b3Vector3 d=b-a; + const b3Scalar l=d.length2(); + if(l>GJK_SIMPLEX2_EPS) + { + const b3Scalar t(l>0?-b3Dot(a,d)/l:0); + if(t>=1) { w[0]=0;w[1]=1;m=2;return(b.length2()); } + else if(t<=0) { w[0]=1;w[1]=0;m=1;return(a.length2()); } + else { w[0]=1-(w[1]=t);m=3;return((a+d*t).length2()); } + } + return(-1); + } + static b3Scalar projectorigin( const b3Vector3& a, + const b3Vector3& b, + const b3Vector3& c, + b3Scalar* w,unsigned int& m) + { + static const unsigned int imd3[]={1,2,0}; + const b3Vector3* vt[]={&a,&b,&c}; + const b3Vector3 dl[]={a-b,b-c,c-a}; + const b3Vector3 n=b3Cross(dl[0],dl[1]); + const b3Scalar l=n.length2(); + if(l>GJK_SIMPLEX3_EPS) + { + b3Scalar mindist=-1; + b3Scalar subw[2]={0.f,0.f}; + unsigned int subm(0); + for(unsigned int i=0;i<3;++i) + { + if(b3Dot(*vt[i],b3Cross(dl[i],n))>0) + { + const unsigned int j=imd3[i]; + const b3Scalar subd(projectorigin(*vt[i],*vt[j],subw,subm)); + if((mindist<0)||(subd<mindist)) + { + mindist = subd; + m = static_cast<unsigned int>(((subm&1)?1<<i:0)+((subm&2)?1<<j:0)); + w[i] = subw[0]; + w[j] = subw[1]; + w[imd3[j]] = 0; + } + } + } + if(mindist<0) + { + const b3Scalar d=b3Dot(a,n); + const b3Scalar s=b3Sqrt(l); + const b3Vector3 p=n*(d/l); + mindist = p.length2(); + m = 7; + w[0] = (b3Cross(dl[1],b-p)).length()/s; + w[1] = (b3Cross(dl[2],c-p)).length()/s; + w[2] = 1-(w[0]+w[1]); + } + return(mindist); + } + return(-1); + } + static b3Scalar projectorigin( const b3Vector3& a, + const b3Vector3& b, + const b3Vector3& c, + const b3Vector3& d, + b3Scalar* w,unsigned int& m) + { + static const unsigned int imd3[]={1,2,0}; + const b3Vector3* vt[]={&a,&b,&c,&d}; + const b3Vector3 dl[]={a-d,b-d,c-d}; + const b3Scalar vl=det(dl[0],dl[1],dl[2]); + const bool ng=(vl*b3Dot(a,b3Cross(b-c,a-b)))<=0; + if(ng&&(b3Fabs(vl)>GJK_SIMPLEX4_EPS)) + { + b3Scalar mindist=-1; + b3Scalar subw[3]={0.f,0.f,0.f}; + unsigned int subm(0); + for(unsigned int i=0;i<3;++i) + { + const unsigned int j=imd3[i]; + const b3Scalar s=vl*b3Dot(d,b3Cross(dl[i],dl[j])); + if(s>0) + { + const b3Scalar subd=projectorigin(*vt[i],*vt[j],d,subw,subm); + if((mindist<0)||(subd<mindist)) + { + mindist = subd; + m = static_cast<unsigned int>((subm&1?1<<i:0)+ + (subm&2?1<<j:0)+ + (subm&4?8:0)); + w[i] = subw[0]; + w[j] = subw[1]; + w[imd3[j]] = 0; + w[3] = subw[2]; + } + } + } + if(mindist<0) + { + mindist = 0; + m = 15; + w[0] = det(c,b,d)/vl; + w[1] = det(a,c,d)/vl; + w[2] = det(b,a,d)/vl; + w[3] = 1-(w[0]+w[1]+w[2]); + } + return(mindist); + } + return(-1); + } + }; + + // EPA + struct b3EPA + { + /* Types */ + typedef b3GJK::sSV sSV; + struct sFace + { + b3Vector3 n; + b3Scalar d; + sSV* c[3]; + sFace* f[3]; + sFace* l[2]; + unsigned char e[3]; + unsigned char pass; + }; + struct sList + { + sFace* root; + unsigned int count; + sList() : root(0),count(0) {} + }; + struct sHorizon + { + sFace* cf; + sFace* ff; + unsigned int nf; + sHorizon() : cf(0),ff(0),nf(0) {} + }; + struct eStatus { enum _ { + Valid, + Touching, + Degenerated, + NonConvex, + InvalidHull, + OutOfFaces, + OutOfVertices, + AccuraryReached, + FallBack, + Failed };}; + /* Fields */ + eStatus::_ m_status; + b3GJK::sSimplex m_result; + b3Vector3 m_normal; + b3Scalar m_depth; + sSV m_sv_store[EPA_MAX_VERTICES]; + sFace m_fc_store[EPA_MAX_FACES]; + unsigned int m_nextsv; + sList m_hull; + sList m_stock; + /* Methods */ + b3EPA() + { + Initialize(); + } + + + static inline void bind(sFace* fa,unsigned int ea,sFace* fb,unsigned int eb) + { + fa->e[ea]=(unsigned char)eb;fa->f[ea]=fb; + fb->e[eb]=(unsigned char)ea;fb->f[eb]=fa; + } + static inline void append(sList& list,sFace* face) + { + face->l[0] = 0; + face->l[1] = list.root; + if(list.root) list.root->l[0]=face; + list.root = face; + ++list.count; + } + static inline void remove(sList& list,sFace* face) + { + if(face->l[1]) face->l[1]->l[0]=face->l[0]; + if(face->l[0]) face->l[0]->l[1]=face->l[1]; + if(face==list.root) list.root=face->l[1]; + --list.count; + } + + + void Initialize() + { + m_status = eStatus::Failed; + m_normal = b3MakeVector3(0,0,0); + m_depth = 0; + m_nextsv = 0; + for(unsigned int i=0;i<EPA_MAX_FACES;++i) + { + append(m_stock,&m_fc_store[EPA_MAX_FACES-i-1]); + } + } + eStatus::_ Evaluate(b3GJK& gjk,const b3Vector3& guess) + { + b3GJK::sSimplex& simplex=*gjk.m_simplex; + if((simplex.rank>1)&&gjk.EncloseOrigin()) + { + + /* Clean up */ + while(m_hull.root) + { + sFace* f = m_hull.root; + remove(m_hull,f); + append(m_stock,f); + } + m_status = eStatus::Valid; + m_nextsv = 0; + /* Orient simplex */ + if(gjk.det( simplex.c[0]->w-simplex.c[3]->w, + simplex.c[1]->w-simplex.c[3]->w, + simplex.c[2]->w-simplex.c[3]->w)<0) + { + b3Swap(simplex.c[0],simplex.c[1]); + b3Swap(simplex.p[0],simplex.p[1]); + } + /* Build initial hull */ + sFace* tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true), + newface(simplex.c[1],simplex.c[0],simplex.c[3],true), + newface(simplex.c[2],simplex.c[1],simplex.c[3],true), + newface(simplex.c[0],simplex.c[2],simplex.c[3],true)}; + if(m_hull.count==4) + { + sFace* best=findbest(); + sFace outer=*best; + unsigned int pass=0; + unsigned int iterations=0; + bind(tetra[0],0,tetra[1],0); + bind(tetra[0],1,tetra[2],0); + bind(tetra[0],2,tetra[3],0); + bind(tetra[1],1,tetra[3],2); + bind(tetra[1],2,tetra[2],1); + bind(tetra[2],2,tetra[3],1); + m_status=eStatus::Valid; + for(;iterations<EPA_MAX_ITERATIONS;++iterations) + { + if(m_nextsv<EPA_MAX_VERTICES) + { + sHorizon horizon; + sSV* w=&m_sv_store[m_nextsv++]; + bool valid=true; + best->pass = (unsigned char)(++pass); + gjk.getsupport(best->n,*w); + const b3Scalar wdist=b3Dot(best->n,w->w)-best->d; + if(wdist>EPA_ACCURACY) + { + for(unsigned int j=0;(j<3)&&valid;++j) + { + valid&=expand( pass,w, + best->f[j],best->e[j], + horizon); + } + if(valid&&(horizon.nf>=3)) + { + bind(horizon.cf,1,horizon.ff,2); + remove(m_hull,best); + append(m_stock,best); + best=findbest(); + outer=*best; + } else { + m_status=eStatus::Failed; + //m_status=eStatus::InvalidHull; + break; } + } else { m_status=eStatus::AccuraryReached;break; } + } else { m_status=eStatus::OutOfVertices;break; } + } + const b3Vector3 projection=outer.n*outer.d; + m_normal = outer.n; + m_depth = outer.d; + m_result.rank = 3; + m_result.c[0] = outer.c[0]; + m_result.c[1] = outer.c[1]; + m_result.c[2] = outer.c[2]; + m_result.p[0] = b3Cross( outer.c[1]->w-projection, + outer.c[2]->w-projection).length(); + m_result.p[1] = b3Cross( outer.c[2]->w-projection, + outer.c[0]->w-projection).length(); + m_result.p[2] = b3Cross( outer.c[0]->w-projection, + outer.c[1]->w-projection).length(); + const b3Scalar sum=m_result.p[0]+m_result.p[1]+m_result.p[2]; + m_result.p[0] /= sum; + m_result.p[1] /= sum; + m_result.p[2] /= sum; + return(m_status); + } + } + /* Fallback */ + m_status = eStatus::FallBack; + m_normal = -guess; + const b3Scalar nl=m_normal.length(); + if(nl>0) + m_normal = m_normal/nl; + else + m_normal = b3MakeVector3(1,0,0); + m_depth = 0; + m_result.rank=1; + m_result.c[0]=simplex.c[0]; + m_result.p[0]=1; + return(m_status); + } + bool getedgedist(sFace* face, sSV* a, sSV* b, b3Scalar& dist) + { + const b3Vector3 ba = b->w - a->w; + const b3Vector3 n_ab = b3Cross(ba, face->n); // Outward facing edge normal direction, on triangle plane + const b3Scalar a_dot_nab = b3Dot(a->w, n_ab); // Only care about the sign to determine inside/outside, so not normalization required + + if(a_dot_nab < 0) + { + // Outside of edge a->b + + const b3Scalar ba_l2 = ba.length2(); + const b3Scalar a_dot_ba = b3Dot(a->w, ba); + const b3Scalar b_dot_ba = b3Dot(b->w, ba); + + if(a_dot_ba > 0) + { + // Pick distance vertex a + dist = a->w.length(); + } + else if(b_dot_ba < 0) + { + // Pick distance vertex b + dist = b->w.length(); + } + else + { + // Pick distance to edge a->b + const b3Scalar a_dot_b = b3Dot(a->w, b->w); + dist = b3Sqrt(b3Max((a->w.length2() * b->w.length2() - a_dot_b * a_dot_b) / ba_l2, (b3Scalar)0)); + } + + return true; + } + + return false; + } + sFace* newface(sSV* a,sSV* b,sSV* c,bool forced) + { + if(m_stock.root) + { + sFace* face=m_stock.root; + remove(m_stock,face); + append(m_hull,face); + face->pass = 0; + face->c[0] = a; + face->c[1] = b; + face->c[2] = c; + face->n = b3Cross(b->w-a->w,c->w-a->w); + const b3Scalar l=face->n.length(); + const bool v=l>EPA_ACCURACY; + + if(v) + { + if(!(getedgedist(face, a, b, face->d) || + getedgedist(face, b, c, face->d) || + getedgedist(face, c, a, face->d))) + { + // Origin projects to the interior of the triangle + // Use distance to triangle plane + face->d = b3Dot(a->w, face->n) / l; + } + + face->n /= l; + if(forced || (face->d >= -EPA_PLANE_EPS)) + { + return face; + } + else + m_status=eStatus::NonConvex; + } + else + m_status=eStatus::Degenerated; + + remove(m_hull, face); + append(m_stock, face); + return 0; + + } + m_status = m_stock.root ? eStatus::OutOfVertices : eStatus::OutOfFaces; + return 0; + } + sFace* findbest() + { + sFace* minf=m_hull.root; + b3Scalar mind=minf->d*minf->d; + for(sFace* f=minf->l[1];f;f=f->l[1]) + { + const b3Scalar sqd=f->d*f->d; + if(sqd<mind) + { + minf=f; + mind=sqd; + } + } + return(minf); + } + bool expand(unsigned int pass,sSV* w,sFace* f,unsigned int e,sHorizon& horizon) + { + static const unsigned int i1m3[]={1,2,0}; + static const unsigned int i2m3[]={2,0,1}; + if(f->pass!=pass) + { + const unsigned int e1=i1m3[e]; + if((b3Dot(f->n,w->w)-f->d)<-EPA_PLANE_EPS) + { + sFace* nf=newface(f->c[e1],f->c[e],w,false); + if(nf) + { + bind(nf,0,f,e); + if(horizon.cf) bind(horizon.cf,1,nf,2); else horizon.ff=nf; + horizon.cf=nf; + ++horizon.nf; + return(true); + } + } + else + { + const unsigned int e2=i2m3[e]; + f->pass = (unsigned char)pass; + if( expand(pass,w,f->f[e1],f->e[e1],horizon)&& + expand(pass,w,f->f[e2],f->e[e2],horizon)) + { + remove(m_hull,f); + append(m_stock,f); + return(true); + } + } + } + return(false); + } + + }; + + // + static void Initialize(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + b3GjkEpaSolver2::sResults& results, + tShape& shape, + bool withmargins) + { + /* Results */ + results.witnesses[0] = + results.witnesses[1] = b3MakeVector3(0,0,0); + results.status = b3GjkEpaSolver2::sResults::Separated; + /* Shape */ + shape.m_shapes[0] = hullA; + shape.m_shapes[1] = hullB; + shape.m_toshape1 = transB.getBasis().transposeTimes(transA.getBasis()); + shape.m_toshape0 = transA.inverseTimes(transB); + shape.EnableMargin(withmargins); + } + +} + +// +// Api +// + +using namespace gjkepa2_impl2; + +// +int b3GjkEpaSolver2::StackSizeRequirement() +{ + return(sizeof(b3GJK)+sizeof(b3EPA)); +} + +// +bool b3GjkEpaSolver2::Distance( const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results) +{ + tShape shape; + Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,false); + b3GJK gjk(verticesA,verticesB); + b3GJK::eStatus::_ gjk_status=gjk.Evaluate(shape,guess); + if(gjk_status==b3GJK::eStatus::Valid) + { + b3Vector3 w0=b3MakeVector3(0,0,0); + b3Vector3 w1=b3MakeVector3(0,0,0); + for(unsigned int i=0;i<gjk.m_simplex->rank;++i) + { + const b3Scalar p=gjk.m_simplex->p[i]; + w0+=shape.Support( gjk.m_simplex->c[i]->d,0,verticesA,verticesB)*p; + w1+=shape.Support(-gjk.m_simplex->c[i]->d,1,verticesA,verticesB)*p; + } + results.witnesses[0] = transA*w0; + results.witnesses[1] = transA*w1; + results.normal = w0-w1; + results.distance = results.normal.length(); + results.normal /= results.distance>GJK_MIN_DISTANCE?results.distance:1; + return(true); + } + else + { + results.status = gjk_status==b3GJK::eStatus::Inside? + sResults::Penetrating : + sResults::GJK_Failed ; + return(false); + } +} + +// +bool b3GjkEpaSolver2::Penetration( const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results, + bool usemargins) +{ + + tShape shape; + Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,usemargins); + b3GJK gjk(verticesA,verticesB); + b3GJK::eStatus::_ gjk_status=gjk.Evaluate(shape,guess); + switch(gjk_status) + { + case b3GJK::eStatus::Inside: + { + b3EPA epa; + b3EPA::eStatus::_ epa_status=epa.Evaluate(gjk,-guess); + if(epa_status!=b3EPA::eStatus::Failed) + { + b3Vector3 w0=b3MakeVector3(0,0,0); + for(unsigned int i=0;i<epa.m_result.rank;++i) + { + w0+=shape.Support(epa.m_result.c[i]->d,0,verticesA,verticesB)*epa.m_result.p[i]; + } + results.status = sResults::Penetrating; + results.witnesses[0] = transA*w0; + results.witnesses[1] = transA*(w0-epa.m_normal*epa.m_depth); + results.normal = -epa.m_normal; + results.distance = -epa.m_depth; + return(true); + } else results.status=sResults::EPA_Failed; + } + break; + case b3GJK::eStatus::Failed: + results.status=sResults::GJK_Failed; + break; + default: + { + } + } + return(false); +} + + +#if 0 +// +b3Scalar b3GjkEpaSolver2::SignedDistance(const b3Vector3& position, + b3Scalar margin, + const b3Transform& transA, + const b3ConvexPolyhedronData& hullA, + const b3AlignedObjectArray<b3Vector3>& verticesA, + sResults& results) +{ + tShape shape; + btSphereShape shape1(margin); + b3Transform wtrs1(b3Quaternion(0,0,0,1),position); + Initialize(shape0,wtrs0,&shape1,wtrs1,results,shape,false); + GJK gjk; + GJK::eStatus::_ gjk_status=gjk.Evaluate(shape,b3Vector3(1,1,1)); + if(gjk_status==GJK::eStatus::Valid) + { + b3Vector3 w0=b3Vector3(0,0,0); + b3Vector3 w1=b3Vector3(0,0,0); + for(unsigned int i=0;i<gjk.m_simplex->rank;++i) + { + const b3Scalar p=gjk.m_simplex->p[i]; + w0+=shape.Support( gjk.m_simplex->c[i]->d,0)*p; + w1+=shape.Support(-gjk.m_simplex->c[i]->d,1)*p; + } + results.witnesses[0] = wtrs0*w0; + results.witnesses[1] = wtrs0*w1; + const b3Vector3 delta= results.witnesses[1]- + results.witnesses[0]; + const b3Scalar margin= shape0->getMarginNonVirtual()+ + shape1.getMarginNonVirtual(); + const b3Scalar length= delta.length(); + results.normal = delta/length; + results.witnesses[0] += results.normal*margin; + return(length-margin); + } + else + { + if(gjk_status==GJK::eStatus::Inside) + { + if(Penetration(shape0,wtrs0,&shape1,wtrs1,gjk.m_ray,results)) + { + const b3Vector3 delta= results.witnesses[0]- + results.witnesses[1]; + const b3Scalar length= delta.length(); + if (length >= B3_EPSILON) + results.normal = delta/length; + return(-length); + } + } + } + return(B3_INFINITY); +} + +// +bool b3GjkEpaSolver2::SignedDistance(const btConvexShape* shape0, + const b3Transform& wtrs0, + const btConvexShape* shape1, + const b3Transform& wtrs1, + const b3Vector3& guess, + sResults& results) +{ + if(!Distance(shape0,wtrs0,shape1,wtrs1,guess,results)) + return(Penetration(shape0,wtrs0,shape1,wtrs1,guess,results,false)); + else + return(true); +} +#endif + + +/* Symbols cleanup */ + +#undef GJK_MAX_ITERATIONS +#undef GJK_ACCURACY +#undef GJK_MIN_DISTANCE +#undef GJK_DUPLICATED_EPS +#undef GJK_SIMPLEX2_EPS +#undef GJK_SIMPLEX3_EPS +#undef GJK_SIMPLEX4_EPS + +#undef EPA_MAX_VERTICES +#undef EPA_MAX_FACES +#undef EPA_MAX_ITERATIONS +#undef EPA_ACCURACY +#undef EPA_FALLBACK +#undef EPA_PLANE_EPS +#undef EPA_INSIDE_EPS diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h new file mode 100644 index 0000000000..976238a04c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h @@ -0,0 +1,82 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the +use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not +claim that you wrote the original software. If you use this software in a +product, an acknowledgment in the product documentation would be appreciated +but is not required. +2. Altered source versions must be plainly marked as such, and must not be +misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +/* +GJK-EPA collision solver by Nathanael Presson, 2008 +*/ +#ifndef B3_GJK_EPA2_H +#define B3_GJK_EPA2_H + +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Transform.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" + + +///btGjkEpaSolver contributed under zlib by Nathanael Presson +struct b3GjkEpaSolver2 +{ +struct sResults + { + enum eStatus + { + Separated, /* Shapes doesnt penetrate */ + Penetrating, /* Shapes are penetrating */ + GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */ + EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */ + } status; + b3Vector3 witnesses[2]; + b3Vector3 normal; + b3Scalar distance; + }; + +static int StackSizeRequirement(); + +static bool Distance( const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results); + +static bool Penetration( const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results, + bool usemargins=true); +#if 0 +static b3Scalar SignedDistance( const b3Vector3& position, + b3Scalar margin, + const btConvexShape* shape, + const btTransform& wtrs, + sResults& results); + +static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs0, + const btConvexShape* shape1,const btTransform& wtrs1, + const b3Vector3& guess, + sResults& results); +#endif + +}; + +#endif //B3_GJK_EPA2_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp new file mode 100644 index 0000000000..e9e51d5a36 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp @@ -0,0 +1,390 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include "b3OptimizedBvh.h" +#include "b3StridingMeshInterface.h" +#include "Bullet3Geometry/b3AabbUtil.h" + + +b3OptimizedBvh::b3OptimizedBvh() +{ +} + +b3OptimizedBvh::~b3OptimizedBvh() +{ +} + + +void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax) +{ + m_useQuantization = useQuantizedAabbCompression; + + + // NodeArray triangleNodes; + + struct NodeTriangleCallback : public b3InternalTriangleIndexCallback + { + + NodeArray& m_triangleNodes; + + NodeTriangleCallback& operator=(NodeTriangleCallback& other) + { + m_triangleNodes.copyFromArray(other.m_triangleNodes); + return *this; + } + + NodeTriangleCallback(NodeArray& triangleNodes) + :m_triangleNodes(triangleNodes) + { + } + + virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + { + b3OptimizedBvhNode node; + b3Vector3 aabbMin,aabbMax; + aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + aabbMin.setMin(triangle[0]); + aabbMax.setMax(triangle[0]); + aabbMin.setMin(triangle[1]); + aabbMax.setMax(triangle[1]); + aabbMin.setMin(triangle[2]); + aabbMax.setMax(triangle[2]); + + //with quantization? + node.m_aabbMinOrg = aabbMin; + node.m_aabbMaxOrg = aabbMax; + + node.m_escapeIndex = -1; + + //for child nodes + node.m_subPart = partId; + node.m_triangleIndex = triangleIndex; + m_triangleNodes.push_back(node); + } + }; + struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback + { + QuantizedNodeArray& m_triangleNodes; + const b3QuantizedBvh* m_optimizedTree; // for quantization + + QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other) + { + m_triangleNodes.copyFromArray(other.m_triangleNodes); + m_optimizedTree = other.m_optimizedTree; + return *this; + } + + QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes,const b3QuantizedBvh* tree) + :m_triangleNodes(triangleNodes),m_optimizedTree(tree) + { + } + + virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + { + // The partId and triangle index must fit in the same (positive) integer + b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS)); + b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS))); + //negative indices are reserved for escapeIndex + b3Assert(triangleIndex>=0); + + b3QuantizedBvhNode node; + b3Vector3 aabbMin,aabbMax; + aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + aabbMin.setMin(triangle[0]); + aabbMax.setMax(triangle[0]); + aabbMin.setMin(triangle[1]); + aabbMax.setMax(triangle[1]); + aabbMin.setMin(triangle[2]); + aabbMax.setMax(triangle[2]); + + //PCK: add these checks for zero dimensions of aabb + const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002); + const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001); + if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION) + { + aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION); + aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION); + } + if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION) + { + aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION); + aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION); + } + if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION) + { + aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION); + aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION); + } + + m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0); + m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1); + + node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex; + + m_triangleNodes.push_back(node); + } + }; + + + + int numLeafNodes = 0; + + + if (m_useQuantization) + { + + //initialize quantization values + setQuantizationValues(bvhAabbMin,bvhAabbMax); + + QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes,this); + + + triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax); + + //now we have an array of leafnodes in m_leafNodes + numLeafNodes = m_quantizedLeafNodes.size(); + + + m_quantizedContiguousNodes.resize(2*numLeafNodes); + + + } else + { + NodeTriangleCallback callback(m_leafNodes); + + b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + + triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax); + + //now we have an array of leafnodes in m_leafNodes + numLeafNodes = m_leafNodes.size(); + + m_contiguousNodes.resize(2*numLeafNodes); + } + + m_curNodeIndex = 0; + + buildTree(0,numLeafNodes); + + ///if the entire tree is small then subtree size, we need to create a header info for the tree + if(m_useQuantization && !m_SubtreeHeaders.size()) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); + subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]); + subtree.m_rootNodeIndex = 0; + subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex(); + } + + //PCK: update the copy of the size + m_subtreeHeaderCount = m_SubtreeHeaders.size(); + + //PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary + m_quantizedLeafNodes.clear(); + m_leafNodes.clear(); +} + + + + +void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) +{ + if (m_useQuantization) + { + + setQuantizationValues(aabbMin,aabbMax); + + updateBvhNodes(meshInterface,0,m_curNodeIndex,0); + + ///now update all subtree headers + + int i; + for (i=0;i<m_SubtreeHeaders.size();i++) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i]; + subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]); + } + + } else + { + + } +} + + + + +void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) +{ + //incrementally initialize quantization values + b3Assert(m_useQuantization); + + b3Assert(aabbMin.getX() > m_bvhAabbMin.getX()); + b3Assert(aabbMin.getY() > m_bvhAabbMin.getY()); + b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ()); + + b3Assert(aabbMax.getX() < m_bvhAabbMax.getX()); + b3Assert(aabbMax.getY() < m_bvhAabbMax.getY()); + b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ()); + + ///we should update all quantization values, using updateBvhNodes(meshInterface); + ///but we only update chunks that overlap the given aabb + + unsigned short quantizedQueryAabbMin[3]; + unsigned short quantizedQueryAabbMax[3]; + + quantize(&quantizedQueryAabbMin[0],aabbMin,0); + quantize(&quantizedQueryAabbMax[0],aabbMax,1); + + int i; + for (i=0;i<this->m_SubtreeHeaders.size();i++) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i]; + + //PCK: unsigned instead of bool + unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); + if (overlap != 0) + { + updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i); + + subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]); + } + } + +} + +void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index) +{ + (void)index; + + b3Assert(m_useQuantization); + + int curNodeSubPart=-1; + + //get access info to trianglemesh data + const unsigned char *vertexbase = 0; + int numverts = 0; + PHY_ScalarType type = PHY_INTEGER; + int stride = 0; + const unsigned char *indexbase = 0; + int indexstride = 0; + int numfaces = 0; + PHY_ScalarType indicestype = PHY_INTEGER; + + b3Vector3 triangleVerts[3]; + b3Vector3 aabbMin,aabbMax; + const b3Vector3& meshScaling = meshInterface->getScaling(); + + int i; + for (i=endNode-1;i>=firstNode;i--) + { + + + b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i]; + if (curNode.isLeafNode()) + { + //recalc aabb from triangle data + int nodeSubPart = curNode.getPartId(); + int nodeTriangleIndex = curNode.getTriangleIndex(); + if (nodeSubPart != curNodeSubPart) + { + if (curNodeSubPart >= 0) + meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); + meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts, type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart); + + curNodeSubPart = nodeSubPart; + b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT); + } + //triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts, + + unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride); + + + for (int j=2;j>=0;j--) + { + + int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j]; + if (type == PHY_FLOAT) + { + float* graphicsbase = (float*)(vertexbase+graphicsindex*stride); + triangleVerts[j] = b3MakeVector3( + graphicsbase[0]*meshScaling.getX(), + graphicsbase[1]*meshScaling.getY(), + graphicsbase[2]*meshScaling.getZ()); + } + else + { + double* graphicsbase = (double*)(vertexbase+graphicsindex*stride); + triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ())); + } + } + + + + aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + aabbMin.setMin(triangleVerts[0]); + aabbMax.setMax(triangleVerts[0]); + aabbMin.setMin(triangleVerts[1]); + aabbMax.setMax(triangleVerts[1]); + aabbMin.setMin(triangleVerts[2]); + aabbMax.setMax(triangleVerts[2]); + + quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0); + quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1); + + } else + { + //combine aabb from both children + + b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1]; + + b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] : + &m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()]; + + + { + for (int i=0;i<3;i++) + { + curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i]; + if (curNode.m_quantizedAabbMin[i]>rightChildNode->m_quantizedAabbMin[i]) + curNode.m_quantizedAabbMin[i]=rightChildNode->m_quantizedAabbMin[i]; + + curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i]; + if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i]) + curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i]; + } + } + } + + } + + if (curNodeSubPart >= 0) + meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); + + +} + +///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' +b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) +{ + b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian); + + //we don't add additional data so just do a static upcast + return static_cast<b3OptimizedBvh*>(bvh); +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h new file mode 100644 index 0000000000..0272ef83bf --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h @@ -0,0 +1,65 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +///Contains contributions from Disney Studio's + +#ifndef B3_OPTIMIZED_BVH_H +#define B3_OPTIMIZED_BVH_H + +#include "b3QuantizedBvh.h" + +class b3StridingMeshInterface; + + +///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface. +B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh +{ + +public: + B3_DECLARE_ALIGNED_ALLOCATOR(); + +protected: + +public: + + b3OptimizedBvh(); + + virtual ~b3OptimizedBvh(); + + void build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax); + + void refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax); + + void refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax); + + void updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index); + + /// Data buffer MUST be 16 byte aligned + virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const + { + return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian); + + } + + ///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' + static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); + + +}; + + +#endif //B3_OPTIMIZED_BVH_H + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp new file mode 100644 index 0000000000..52027e1118 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp @@ -0,0 +1,1301 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "b3QuantizedBvh.h" + +#include "Bullet3Geometry/b3AabbUtil.h" + + +#define RAYAABB2 + +b3QuantizedBvh::b3QuantizedBvh() : + m_bulletVersion(B3_BULLET_VERSION), + m_useQuantization(false), + m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY) + //m_traversalMode(TRAVERSAL_STACKLESS) + //m_traversalMode(TRAVERSAL_RECURSIVE) + ,m_subtreeHeaderCount(0) //PCK: add this line +{ + m_bvhAabbMin.setValue(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY); + m_bvhAabbMax.setValue(B3_INFINITY,B3_INFINITY,B3_INFINITY); +} + + + + + +void b3QuantizedBvh::buildInternal() +{ + ///assumes that caller filled in the m_quantizedLeafNodes + m_useQuantization = true; + int numLeafNodes = 0; + + if (m_useQuantization) + { + //now we have an array of leafnodes in m_leafNodes + numLeafNodes = m_quantizedLeafNodes.size(); + + m_quantizedContiguousNodes.resize(2*numLeafNodes); + + } + + m_curNodeIndex = 0; + + buildTree(0,numLeafNodes); + + ///if the entire tree is small then subtree size, we need to create a header info for the tree + if(m_useQuantization && !m_SubtreeHeaders.size()) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); + subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]); + subtree.m_rootNodeIndex = 0; + subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex(); + } + + //PCK: update the copy of the size + m_subtreeHeaderCount = m_SubtreeHeaders.size(); + + //PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary + m_quantizedLeafNodes.clear(); + m_leafNodes.clear(); +} + + + +///just for debugging, to visualize the individual patches/subtrees +#ifdef DEBUG_PATCH_COLORS +b3Vector3 color[4]= +{ + b3Vector3(1,0,0), + b3Vector3(0,1,0), + b3Vector3(0,0,1), + b3Vector3(0,1,1) +}; +#endif //DEBUG_PATCH_COLORS + + + +void b3QuantizedBvh::setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin) +{ + //enlarge the AABB to avoid division by zero when initializing the quantization values + b3Vector3 clampValue =b3MakeVector3(quantizationMargin,quantizationMargin,quantizationMargin); + m_bvhAabbMin = bvhAabbMin - clampValue; + m_bvhAabbMax = bvhAabbMax + clampValue; + b3Vector3 aabbSize = m_bvhAabbMax - m_bvhAabbMin; + m_bvhQuantization = b3MakeVector3(b3Scalar(65533.0),b3Scalar(65533.0),b3Scalar(65533.0)) / aabbSize; + m_useQuantization = true; +} + + + + +b3QuantizedBvh::~b3QuantizedBvh() +{ +} + +#ifdef DEBUG_TREE_BUILDING +int gStackDepth = 0; +int gMaxStackDepth = 0; +#endif //DEBUG_TREE_BUILDING + +void b3QuantizedBvh::buildTree (int startIndex,int endIndex) +{ +#ifdef DEBUG_TREE_BUILDING + gStackDepth++; + if (gStackDepth > gMaxStackDepth) + gMaxStackDepth = gStackDepth; +#endif //DEBUG_TREE_BUILDING + + + int splitAxis, splitIndex, i; + int numIndices =endIndex-startIndex; + int curIndex = m_curNodeIndex; + + b3Assert(numIndices>0); + + if (numIndices==1) + { +#ifdef DEBUG_TREE_BUILDING + gStackDepth--; +#endif //DEBUG_TREE_BUILDING + + assignInternalNodeFromLeafNode(m_curNodeIndex,startIndex); + + m_curNodeIndex++; + return; + } + //calculate Best Splitting Axis and where to split it. Sort the incoming 'leafNodes' array within range 'startIndex/endIndex'. + + splitAxis = calcSplittingAxis(startIndex,endIndex); + + splitIndex = sortAndCalcSplittingIndex(startIndex,endIndex,splitAxis); + + int internalNodeIndex = m_curNodeIndex; + + //set the min aabb to 'inf' or a max value, and set the max aabb to a -inf/minimum value. + //the aabb will be expanded during buildTree/mergeInternalNodeAabb with actual node values + setInternalNodeAabbMin(m_curNodeIndex,m_bvhAabbMax);//can't use b3Vector3(B3_INFINITY,B3_INFINITY,B3_INFINITY)) because of quantization + setInternalNodeAabbMax(m_curNodeIndex,m_bvhAabbMin);//can't use b3Vector3(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY)) because of quantization + + + for (i=startIndex;i<endIndex;i++) + { + mergeInternalNodeAabb(m_curNodeIndex,getAabbMin(i),getAabbMax(i)); + } + + m_curNodeIndex++; + + + //internalNode->m_escapeIndex; + + int leftChildNodexIndex = m_curNodeIndex; + + //build left child tree + buildTree(startIndex,splitIndex); + + int rightChildNodexIndex = m_curNodeIndex; + //build right child tree + buildTree(splitIndex,endIndex); + +#ifdef DEBUG_TREE_BUILDING + gStackDepth--; +#endif //DEBUG_TREE_BUILDING + + int escapeIndex = m_curNodeIndex - curIndex; + + if (m_useQuantization) + { + //escapeIndex is the number of nodes of this subtree + const int sizeQuantizedNode =sizeof(b3QuantizedBvhNode); + const int treeSizeInBytes = escapeIndex * sizeQuantizedNode; + if (treeSizeInBytes > MAX_SUBTREE_SIZE_IN_BYTES) + { + updateSubtreeHeaders(leftChildNodexIndex,rightChildNodexIndex); + } + } else + { + + } + + setInternalNodeEscapeIndex(internalNodeIndex,escapeIndex); + +} + +void b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex) +{ + b3Assert(m_useQuantization); + + b3QuantizedBvhNode& leftChildNode = m_quantizedContiguousNodes[leftChildNodexIndex]; + int leftSubTreeSize = leftChildNode.isLeafNode() ? 1 : leftChildNode.getEscapeIndex(); + int leftSubTreeSizeInBytes = leftSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); + + b3QuantizedBvhNode& rightChildNode = m_quantizedContiguousNodes[rightChildNodexIndex]; + int rightSubTreeSize = rightChildNode.isLeafNode() ? 1 : rightChildNode.getEscapeIndex(); + int rightSubTreeSizeInBytes = rightSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); + + if(leftSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); + subtree.setAabbFromQuantizeNode(leftChildNode); + subtree.m_rootNodeIndex = leftChildNodexIndex; + subtree.m_subtreeSize = leftSubTreeSize; + } + + if(rightSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) + { + b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); + subtree.setAabbFromQuantizeNode(rightChildNode); + subtree.m_rootNodeIndex = rightChildNodexIndex; + subtree.m_subtreeSize = rightSubTreeSize; + } + + //PCK: update the copy of the size + m_subtreeHeaderCount = m_SubtreeHeaders.size(); +} + + +int b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis) +{ + int i; + int splitIndex =startIndex; + int numIndices = endIndex - startIndex; + b3Scalar splitValue; + + b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + for (i=startIndex;i<endIndex;i++) + { + b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); + means+=center; + } + means *= (b3Scalar(1.)/(b3Scalar)numIndices); + + splitValue = means[splitAxis]; + + //sort leafNodes so all values larger then splitValue comes first, and smaller values start from 'splitIndex'. + for (i=startIndex;i<endIndex;i++) + { + b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); + if (center[splitAxis] > splitValue) + { + //swap + swapLeafNodes(i,splitIndex); + splitIndex++; + } + } + + //if the splitIndex causes unbalanced trees, fix this by using the center in between startIndex and endIndex + //otherwise the tree-building might fail due to stack-overflows in certain cases. + //unbalanced1 is unsafe: it can cause stack overflows + //bool unbalanced1 = ((splitIndex==startIndex) || (splitIndex == (endIndex-1))); + + //unbalanced2 should work too: always use center (perfect balanced trees) + //bool unbalanced2 = true; + + //this should be safe too: + int rangeBalancedIndices = numIndices/3; + bool unbalanced = ((splitIndex<=(startIndex+rangeBalancedIndices)) || (splitIndex >=(endIndex-1-rangeBalancedIndices))); + + if (unbalanced) + { + splitIndex = startIndex+ (numIndices>>1); + } + + bool unbal = (splitIndex==startIndex) || (splitIndex == (endIndex)); + (void)unbal; + b3Assert(!unbal); + + return splitIndex; +} + + +int b3QuantizedBvh::calcSplittingAxis(int startIndex,int endIndex) +{ + int i; + + b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + b3Vector3 variance=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + int numIndices = endIndex-startIndex; + + for (i=startIndex;i<endIndex;i++) + { + b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); + means+=center; + } + means *= (b3Scalar(1.)/(b3Scalar)numIndices); + + for (i=startIndex;i<endIndex;i++) + { + b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); + b3Vector3 diff2 = center-means; + diff2 = diff2 * diff2; + variance += diff2; + } + variance *= (b3Scalar(1.)/ ((b3Scalar)numIndices-1) ); + + return variance.maxAxis(); +} + + + +void b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +{ + //either choose recursive traversal (walkTree) or stackless (walkStacklessTree) + + if (m_useQuantization) + { + ///quantize query AABB + unsigned short int quantizedQueryAabbMin[3]; + unsigned short int quantizedQueryAabbMax[3]; + quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0); + quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1); + + switch (m_traversalMode) + { + case TRAVERSAL_STACKLESS: + walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,0,m_curNodeIndex); + break; + case TRAVERSAL_STACKLESS_CACHE_FRIENDLY: + walkStacklessQuantizedTreeCacheFriendly(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + break; + case TRAVERSAL_RECURSIVE: + { + const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[0]; + walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + } + break; + default: + //unsupported + b3Assert(0); + } + } else + { + walkStacklessTree(nodeCallback,aabbMin,aabbMax); + } +} + + +static int b3s_maxIterations = 0; + + +void b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +{ + b3Assert(!m_useQuantization); + + const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0]; + int escapeIndex, curIndex = 0; + int walkIterations = 0; + bool isLeafNode; + //PCK: unsigned instead of bool + unsigned aabbOverlap; + + while (curIndex < m_curNodeIndex) + { + //catch bugs in tree data + b3Assert (walkIterations < m_curNodeIndex); + + walkIterations++; + aabbOverlap = b3TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg); + isLeafNode = rootNode->m_escapeIndex == -1; + + //PCK: unsigned instead of bool + if (isLeafNode && (aabbOverlap != 0)) + { + nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex); + } + + //PCK: unsigned instead of bool + if ((aabbOverlap != 0) || isLeafNode) + { + rootNode++; + curIndex++; + } else + { + escapeIndex = rootNode->m_escapeIndex; + rootNode += escapeIndex; + curIndex += escapeIndex; + } + } + if (b3s_maxIterations < walkIterations) + b3s_maxIterations = walkIterations; + +} + +/* +///this was the original recursive traversal, before we optimized towards stackless traversal +void b3QuantizedBvh::walkTree(b3OptimizedBvhNode* rootNode,b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +{ + bool isLeafNode, aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMin,rootNode->m_aabbMax); + if (aabbOverlap) + { + isLeafNode = (!rootNode->m_leftChild && !rootNode->m_rightChild); + if (isLeafNode) + { + nodeCallback->processNode(rootNode); + } else + { + walkTree(rootNode->m_leftChild,nodeCallback,aabbMin,aabbMax); + walkTree(rootNode->m_rightChild,nodeCallback,aabbMin,aabbMax); + } + } + +} +*/ + +void b3QuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const +{ + b3Assert(m_useQuantization); + + bool isLeafNode; + //PCK: unsigned instead of bool + unsigned aabbOverlap; + + //PCK: unsigned instead of bool + aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax); + isLeafNode = currentNode->isLeafNode(); + + //PCK: unsigned instead of bool + if (aabbOverlap != 0) + { + if (isLeafNode) + { + nodeCallback->processNode(currentNode->getPartId(),currentNode->getTriangleIndex()); + } else + { + //process left and right children + const b3QuantizedBvhNode* leftChildNode = currentNode+1; + walkRecursiveQuantizedTreeAgainstQueryAabb(leftChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + + const b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? leftChildNode+1:leftChildNode+leftChildNode->getEscapeIndex(); + walkRecursiveQuantizedTreeAgainstQueryAabb(rightChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + } + } +} + + + +void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const +{ + b3Assert(!m_useQuantization); + + const b3OptimizedBvhNode* rootNode = &m_contiguousNodes[0]; + int escapeIndex, curIndex = 0; + int walkIterations = 0; + bool isLeafNode; + //PCK: unsigned instead of bool + unsigned aabbOverlap=0; + unsigned rayBoxOverlap=0; + b3Scalar lambda_max = 1.0; + + /* Quick pruning by quantized box */ + b3Vector3 rayAabbMin = raySource; + b3Vector3 rayAabbMax = raySource; + rayAabbMin.setMin(rayTarget); + rayAabbMax.setMax(rayTarget); + + /* Add box cast extents to bounding box */ + rayAabbMin += aabbMin; + rayAabbMax += aabbMax; + +#ifdef RAYAABB2 + b3Vector3 rayDir = (rayTarget-raySource); + rayDir.normalize (); + lambda_max = rayDir.dot(rayTarget-raySource); + ///what about division by zero? --> just set rayDirection[i] to 1.0 + b3Vector3 rayDirectionInverse; + rayDirectionInverse[0] = rayDir[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[0]; + rayDirectionInverse[1] = rayDir[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[1]; + rayDirectionInverse[2] = rayDir[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[2]; + unsigned int sign[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0}; +#endif + + b3Vector3 bounds[2]; + + while (curIndex < m_curNodeIndex) + { + b3Scalar param = 1.0; + //catch bugs in tree data + b3Assert (walkIterations < m_curNodeIndex); + + walkIterations++; + + bounds[0] = rootNode->m_aabbMinOrg; + bounds[1] = rootNode->m_aabbMaxOrg; + /* Add box cast extents */ + bounds[0] -= aabbMax; + bounds[1] -= aabbMin; + + aabbOverlap = b3TestAabbAgainstAabb2(rayAabbMin,rayAabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg); + //perhaps profile if it is worth doing the aabbOverlap test first + +#ifdef RAYAABB2 + ///careful with this check: need to check division by zero (above) and fix the unQuantize method + ///thanks Joerg/hiker for the reproduction case! + ///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858 + rayBoxOverlap = aabbOverlap ? b3RayAabb2 (raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false; + +#else + b3Vector3 normal; + rayBoxOverlap = b3RayAabb(raySource, rayTarget,bounds[0],bounds[1],param, normal); +#endif + + isLeafNode = rootNode->m_escapeIndex == -1; + + //PCK: unsigned instead of bool + if (isLeafNode && (rayBoxOverlap != 0)) + { + nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex); + } + + //PCK: unsigned instead of bool + if ((rayBoxOverlap != 0) || isLeafNode) + { + rootNode++; + curIndex++; + } else + { + escapeIndex = rootNode->m_escapeIndex; + rootNode += escapeIndex; + curIndex += escapeIndex; + } + } + if (b3s_maxIterations < walkIterations) + b3s_maxIterations = walkIterations; + +} + + + +void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const +{ + b3Assert(m_useQuantization); + + int curIndex = startNodeIndex; + int walkIterations = 0; + int subTreeSize = endNodeIndex - startNodeIndex; + (void)subTreeSize; + + const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; + int escapeIndex; + + bool isLeafNode; + //PCK: unsigned instead of bool + unsigned boxBoxOverlap = 0; + unsigned rayBoxOverlap = 0; + + b3Scalar lambda_max = 1.0; + +#ifdef RAYAABB2 + b3Vector3 rayDirection = (rayTarget-raySource); + rayDirection.normalize (); + lambda_max = rayDirection.dot(rayTarget-raySource); + ///what about division by zero? --> just set rayDirection[i] to 1.0 + rayDirection[0] = rayDirection[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[0]; + rayDirection[1] = rayDirection[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[1]; + rayDirection[2] = rayDirection[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[2]; + unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0}; +#endif + + /* Quick pruning by quantized box */ + b3Vector3 rayAabbMin = raySource; + b3Vector3 rayAabbMax = raySource; + rayAabbMin.setMin(rayTarget); + rayAabbMax.setMax(rayTarget); + + /* Add box cast extents to bounding box */ + rayAabbMin += aabbMin; + rayAabbMax += aabbMax; + + unsigned short int quantizedQueryAabbMin[3]; + unsigned short int quantizedQueryAabbMax[3]; + quantizeWithClamp(quantizedQueryAabbMin,rayAabbMin,0); + quantizeWithClamp(quantizedQueryAabbMax,rayAabbMax,1); + + while (curIndex < endNodeIndex) + { + +//#define VISUALLY_ANALYZE_BVH 1 +#ifdef VISUALLY_ANALYZE_BVH + //some code snippet to debugDraw aabb, to visually analyze bvh structure + static int drawPatch = 0; + //need some global access to a debugDrawer + extern b3IDebugDraw* debugDrawerPtr; + if (curIndex==drawPatch) + { + b3Vector3 aabbMin,aabbMax; + aabbMin = unQuantize(rootNode->m_quantizedAabbMin); + aabbMax = unQuantize(rootNode->m_quantizedAabbMax); + b3Vector3 color(1,0,0); + debugDrawerPtr->drawAabb(aabbMin,aabbMax,color); + } +#endif//VISUALLY_ANALYZE_BVH + + //catch bugs in tree data + b3Assert (walkIterations < subTreeSize); + + walkIterations++; + //PCK: unsigned instead of bool + // only interested if this is closer than any previous hit + b3Scalar param = 1.0; + rayBoxOverlap = 0; + boxBoxOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax); + isLeafNode = rootNode->isLeafNode(); + if (boxBoxOverlap) + { + b3Vector3 bounds[2]; + bounds[0] = unQuantize(rootNode->m_quantizedAabbMin); + bounds[1] = unQuantize(rootNode->m_quantizedAabbMax); + /* Add box cast extents */ + bounds[0] -= aabbMax; + bounds[1] -= aabbMin; +#if 0 + b3Vector3 normal; + bool ra2 = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0, lambda_max); + bool ra = b3RayAabb (raySource, rayTarget, bounds[0], bounds[1], param, normal); + if (ra2 != ra) + { + printf("functions don't match\n"); + } +#endif +#ifdef RAYAABB2 + ///careful with this check: need to check division by zero (above) and fix the unQuantize method + ///thanks Joerg/hiker for the reproduction case! + ///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858 + + //B3_PROFILE("b3RayAabb2"); + rayBoxOverlap = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max); + +#else + rayBoxOverlap = true;//b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal); +#endif + } + + if (isLeafNode && rayBoxOverlap) + { + nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex()); + } + + //PCK: unsigned instead of bool + if ((rayBoxOverlap != 0) || isLeafNode) + { + rootNode++; + curIndex++; + } else + { + escapeIndex = rootNode->getEscapeIndex(); + rootNode += escapeIndex; + curIndex += escapeIndex; + } + } + if (b3s_maxIterations < walkIterations) + b3s_maxIterations = walkIterations; + +} + +void b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const +{ + b3Assert(m_useQuantization); + + int curIndex = startNodeIndex; + int walkIterations = 0; + int subTreeSize = endNodeIndex - startNodeIndex; + (void)subTreeSize; + + const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; + int escapeIndex; + + bool isLeafNode; + //PCK: unsigned instead of bool + unsigned aabbOverlap; + + while (curIndex < endNodeIndex) + { + +//#define VISUALLY_ANALYZE_BVH 1 +#ifdef VISUALLY_ANALYZE_BVH + //some code snippet to debugDraw aabb, to visually analyze bvh structure + static int drawPatch = 0; + //need some global access to a debugDrawer + extern b3IDebugDraw* debugDrawerPtr; + if (curIndex==drawPatch) + { + b3Vector3 aabbMin,aabbMax; + aabbMin = unQuantize(rootNode->m_quantizedAabbMin); + aabbMax = unQuantize(rootNode->m_quantizedAabbMax); + b3Vector3 color(1,0,0); + debugDrawerPtr->drawAabb(aabbMin,aabbMax,color); + } +#endif//VISUALLY_ANALYZE_BVH + + //catch bugs in tree data + b3Assert (walkIterations < subTreeSize); + + walkIterations++; + //PCK: unsigned instead of bool + aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax); + isLeafNode = rootNode->isLeafNode(); + + if (isLeafNode && aabbOverlap) + { + nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex()); + } + + //PCK: unsigned instead of bool + if ((aabbOverlap != 0) || isLeafNode) + { + rootNode++; + curIndex++; + } else + { + escapeIndex = rootNode->getEscapeIndex(); + rootNode += escapeIndex; + curIndex += escapeIndex; + } + } + if (b3s_maxIterations < walkIterations) + b3s_maxIterations = walkIterations; + +} + +//This traversal can be called from Playstation 3 SPU +void b3QuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const +{ + b3Assert(m_useQuantization); + + int i; + + + for (i=0;i<this->m_SubtreeHeaders.size();i++) + { + const b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i]; + + //PCK: unsigned instead of bool + unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); + if (overlap != 0) + { + walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax, + subtree.m_rootNodeIndex, + subtree.m_rootNodeIndex+subtree.m_subtreeSize); + } + } +} + + +void b3QuantizedBvh::reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const +{ + reportBoxCastOverlappingNodex(nodeCallback,raySource,rayTarget,b3MakeVector3(0,0,0),b3MakeVector3(0,0,0)); +} + + +void b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +{ + //always use stackless + + if (m_useQuantization) + { + walkStacklessQuantizedTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex); + } + else + { + walkStacklessTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex); + } + /* + { + //recursive traversal + b3Vector3 qaabbMin = raySource; + b3Vector3 qaabbMax = raySource; + qaabbMin.setMin(rayTarget); + qaabbMax.setMax(rayTarget); + qaabbMin += aabbMin; + qaabbMax += aabbMax; + reportAabbOverlappingNodex(nodeCallback,qaabbMin,qaabbMax); + } + */ + +} + + +void b3QuantizedBvh::swapLeafNodes(int i,int splitIndex) +{ + if (m_useQuantization) + { + b3QuantizedBvhNode tmp = m_quantizedLeafNodes[i]; + m_quantizedLeafNodes[i] = m_quantizedLeafNodes[splitIndex]; + m_quantizedLeafNodes[splitIndex] = tmp; + } else + { + b3OptimizedBvhNode tmp = m_leafNodes[i]; + m_leafNodes[i] = m_leafNodes[splitIndex]; + m_leafNodes[splitIndex] = tmp; + } +} + +void b3QuantizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex) +{ + if (m_useQuantization) + { + m_quantizedContiguousNodes[internalNode] = m_quantizedLeafNodes[leafNodeIndex]; + } else + { + m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex]; + } +} + +//PCK: include +#include <new> + +#if 0 +//PCK: consts +static const unsigned BVH_ALIGNMENT = 16; +static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1; + +static const unsigned BVH_ALIGNMENT_BLOCKS = 2; +#endif + + +unsigned int b3QuantizedBvh::getAlignmentSerializationPadding() +{ + // I changed this to 0 since the extra padding is not needed or used. + return 0;//BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT; +} + +unsigned b3QuantizedBvh::calculateSerializeBufferSize() const +{ + unsigned baseSize = sizeof(b3QuantizedBvh) + getAlignmentSerializationPadding(); + baseSize += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount; + if (m_useQuantization) + { + return baseSize + m_curNodeIndex * sizeof(b3QuantizedBvhNode); + } + return baseSize + m_curNodeIndex * sizeof(b3OptimizedBvhNode); +} + +bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const +{ + b3Assert(m_subtreeHeaderCount == m_SubtreeHeaders.size()); + m_subtreeHeaderCount = m_SubtreeHeaders.size(); + +/* if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) + { + ///check alignedment for buffer? + b3Assert(0); + return false; + } +*/ + + b3QuantizedBvh *targetBvh = (b3QuantizedBvh *)o_alignedDataBuffer; + + // construct the class so the virtual function table, etc will be set up + // Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor + new (targetBvh) b3QuantizedBvh; + + if (i_swapEndian) + { + targetBvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(m_curNodeIndex)); + + + b3SwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin); + b3SwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax); + b3SwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization); + + targetBvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(m_traversalMode); + targetBvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(m_subtreeHeaderCount)); + } + else + { + targetBvh->m_curNodeIndex = m_curNodeIndex; + targetBvh->m_bvhAabbMin = m_bvhAabbMin; + targetBvh->m_bvhAabbMax = m_bvhAabbMax; + targetBvh->m_bvhQuantization = m_bvhQuantization; + targetBvh->m_traversalMode = m_traversalMode; + targetBvh->m_subtreeHeaderCount = m_subtreeHeaderCount; + } + + targetBvh->m_useQuantization = m_useQuantization; + + unsigned char *nodeData = (unsigned char *)targetBvh; + nodeData += sizeof(b3QuantizedBvh); + + unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + nodeData += sizeToAdd; + + int nodeCount = m_curNodeIndex; + + if (m_useQuantization) + { + targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); + + if (i_swapEndian) + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]); + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]); + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]); + + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]); + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]); + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]); + + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex)); + } + } + else + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]; + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]; + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]; + + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]; + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]; + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]; + + targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex; + + + } + } + nodeData += sizeof(b3QuantizedBvhNode) * nodeCount; + + // this clears the pointer in the member variable it doesn't really do anything to the data + // it does call the destructor on the contained objects, but they are all classes with no destructor defined + // so the memory (which is not freed) is left alone + targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(NULL, 0, 0); + } + else + { + targetBvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); + + if (i_swapEndian) + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMinOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg); + b3SwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMaxOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg); + + targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_escapeIndex)); + targetBvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_subPart)); + targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(m_contiguousNodes[nodeIndex].m_triangleIndex)); + } + } + else + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg = m_contiguousNodes[nodeIndex].m_aabbMinOrg; + targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg = m_contiguousNodes[nodeIndex].m_aabbMaxOrg; + + targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = m_contiguousNodes[nodeIndex].m_escapeIndex; + targetBvh->m_contiguousNodes[nodeIndex].m_subPart = m_contiguousNodes[nodeIndex].m_subPart; + targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = m_contiguousNodes[nodeIndex].m_triangleIndex; + } + } + nodeData += sizeof(b3OptimizedBvhNode) * nodeCount; + + // this clears the pointer in the member variable it doesn't really do anything to the data + // it does call the destructor on the contained objects, but they are all classes with no destructor defined + // so the memory (which is not freed) is left alone + targetBvh->m_contiguousNodes.initializeFromBuffer(NULL, 0, 0); + } + + sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + nodeData += sizeToAdd; + + // Now serialize the subtree headers + targetBvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, m_subtreeHeaderCount, m_subtreeHeaderCount); + if (i_swapEndian) + { + for (int i = 0; i < m_subtreeHeaderCount; i++) + { + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[0]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[1]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[2]); + + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[0]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[1]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[2]); + + targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_rootNodeIndex)); + targetBvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(m_SubtreeHeaders[i].m_subtreeSize)); + } + } + else + { + for (int i = 0; i < m_subtreeHeaderCount; i++) + { + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = (m_SubtreeHeaders[i].m_quantizedAabbMin[0]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = (m_SubtreeHeaders[i].m_quantizedAabbMin[1]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = (m_SubtreeHeaders[i].m_quantizedAabbMin[2]); + + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = (m_SubtreeHeaders[i].m_quantizedAabbMax[0]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = (m_SubtreeHeaders[i].m_quantizedAabbMax[1]); + targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = (m_SubtreeHeaders[i].m_quantizedAabbMax[2]); + + targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = (m_SubtreeHeaders[i].m_rootNodeIndex); + targetBvh->m_SubtreeHeaders[i].m_subtreeSize = (m_SubtreeHeaders[i].m_subtreeSize); + + // need to clear padding in destination buffer + targetBvh->m_SubtreeHeaders[i].m_padding[0] = 0; + targetBvh->m_SubtreeHeaders[i].m_padding[1] = 0; + targetBvh->m_SubtreeHeaders[i].m_padding[2] = 0; + } + } + nodeData += sizeof(b3BvhSubtreeInfo) * m_subtreeHeaderCount; + + // this clears the pointer in the member variable it doesn't really do anything to the data + // it does call the destructor on the contained objects, but they are all classes with no destructor defined + // so the memory (which is not freed) is left alone + targetBvh->m_SubtreeHeaders.initializeFromBuffer(NULL, 0, 0); + + // this wipes the virtual function table pointer at the start of the buffer for the class + *((void**)o_alignedDataBuffer) = NULL; + + return true; +} + +b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) +{ + + if (i_alignedDataBuffer == NULL)// || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) + { + return NULL; + } + b3QuantizedBvh *bvh = (b3QuantizedBvh *)i_alignedDataBuffer; + + if (i_swapEndian) + { + bvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_curNodeIndex)); + + b3UnSwapVector3Endian(bvh->m_bvhAabbMin); + b3UnSwapVector3Endian(bvh->m_bvhAabbMax); + b3UnSwapVector3Endian(bvh->m_bvhQuantization); + + bvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(bvh->m_traversalMode); + bvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(bvh->m_subtreeHeaderCount)); + } + + unsigned int calculatedBufSize = bvh->calculateSerializeBufferSize(); + b3Assert(calculatedBufSize <= i_dataBufferSize); + + if (calculatedBufSize > i_dataBufferSize) + { + return NULL; + } + + unsigned char *nodeData = (unsigned char *)bvh; + nodeData += sizeof(b3QuantizedBvh); + + unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + nodeData += sizeToAdd; + + int nodeCount = bvh->m_curNodeIndex; + + // Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor + // Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor + new (bvh) b3QuantizedBvh(*bvh, false); + + if (bvh->m_useQuantization) + { + bvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); + + if (i_swapEndian) + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]); + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]); + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]); + + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]); + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]); + bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]); + + bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(b3SwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex)); + } + } + nodeData += sizeof(b3QuantizedBvhNode) * nodeCount; + } + else + { + bvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount); + + if (i_swapEndian) + { + for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg); + b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg); + + bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex)); + bvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart)); + bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex)); + } + } + nodeData += sizeof(b3OptimizedBvhNode) * nodeCount; + } + + sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + nodeData += sizeToAdd; + + // Now serialize the subtree headers + bvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, bvh->m_subtreeHeaderCount, bvh->m_subtreeHeaderCount); + if (i_swapEndian) + { + for (int i = 0; i < bvh->m_subtreeHeaderCount; i++) + { + bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0]); + bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1]); + bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2]); + + bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0]); + bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1]); + bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = b3SwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2]); + + bvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_rootNodeIndex)); + bvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(b3SwapEndian(bvh->m_SubtreeHeaders[i].m_subtreeSize)); + } + } + + return bvh; +} + +// Constructor that prevents b3Vector3's default constructor from being called +b3QuantizedBvh::b3QuantizedBvh(b3QuantizedBvh &self, bool /* ownsMemory */) : +m_bvhAabbMin(self.m_bvhAabbMin), +m_bvhAabbMax(self.m_bvhAabbMax), +m_bvhQuantization(self.m_bvhQuantization), +m_bulletVersion(B3_BULLET_VERSION) +{ + +} + +void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData) +{ + m_bvhAabbMax.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMax); + m_bvhAabbMin.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMin); + m_bvhQuantization.deSerializeFloat(quantizedBvhFloatData.m_bvhQuantization); + + m_curNodeIndex = quantizedBvhFloatData.m_curNodeIndex; + m_useQuantization = quantizedBvhFloatData.m_useQuantization!=0; + + { + int numElem = quantizedBvhFloatData.m_numContiguousLeafNodes; + m_contiguousNodes.resize(numElem); + + if (numElem) + { + b3OptimizedBvhNodeFloatData* memPtr = quantizedBvhFloatData.m_contiguousNodesPtr; + + for (int i=0;i<numElem;i++,memPtr++) + { + m_contiguousNodes[i].m_aabbMaxOrg.deSerializeFloat(memPtr->m_aabbMaxOrg); + m_contiguousNodes[i].m_aabbMinOrg.deSerializeFloat(memPtr->m_aabbMinOrg); + m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex; + m_contiguousNodes[i].m_subPart = memPtr->m_subPart; + m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex; + } + } + } + + { + int numElem = quantizedBvhFloatData.m_numQuantizedContiguousNodes; + m_quantizedContiguousNodes.resize(numElem); + + if (numElem) + { + b3QuantizedBvhNodeData* memPtr = quantizedBvhFloatData.m_quantizedContiguousNodesPtr; + for (int i=0;i<numElem;i++,memPtr++) + { + m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2]; + } + } + } + + m_traversalMode = b3TraversalMode(quantizedBvhFloatData.m_traversalMode); + + { + int numElem = quantizedBvhFloatData.m_numSubtreeHeaders; + m_SubtreeHeaders.resize(numElem); + if (numElem) + { + b3BvhSubtreeInfoData* memPtr = quantizedBvhFloatData.m_subTreeInfoPtr; + for (int i=0;i<numElem;i++,memPtr++) + { + m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ; + m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; + m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; + m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; + m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1]; + m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2]; + m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex; + m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize; + } + } + } +} + +void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData) +{ + m_bvhAabbMax.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMax); + m_bvhAabbMin.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMin); + m_bvhQuantization.deSerializeDouble(quantizedBvhDoubleData.m_bvhQuantization); + + m_curNodeIndex = quantizedBvhDoubleData.m_curNodeIndex; + m_useQuantization = quantizedBvhDoubleData.m_useQuantization!=0; + + { + int numElem = quantizedBvhDoubleData.m_numContiguousLeafNodes; + m_contiguousNodes.resize(numElem); + + if (numElem) + { + b3OptimizedBvhNodeDoubleData* memPtr = quantizedBvhDoubleData.m_contiguousNodesPtr; + + for (int i=0;i<numElem;i++,memPtr++) + { + m_contiguousNodes[i].m_aabbMaxOrg.deSerializeDouble(memPtr->m_aabbMaxOrg); + m_contiguousNodes[i].m_aabbMinOrg.deSerializeDouble(memPtr->m_aabbMinOrg); + m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex; + m_contiguousNodes[i].m_subPart = memPtr->m_subPart; + m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex; + } + } + } + + { + int numElem = quantizedBvhDoubleData.m_numQuantizedContiguousNodes; + m_quantizedContiguousNodes.resize(numElem); + + if (numElem) + { + b3QuantizedBvhNodeData* memPtr = quantizedBvhDoubleData.m_quantizedContiguousNodesPtr; + for (int i=0;i<numElem;i++,memPtr++) + { + m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; + m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1]; + m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2]; + } + } + } + + m_traversalMode = b3TraversalMode(quantizedBvhDoubleData.m_traversalMode); + + { + int numElem = quantizedBvhDoubleData.m_numSubtreeHeaders; + m_SubtreeHeaders.resize(numElem); + if (numElem) + { + b3BvhSubtreeInfoData* memPtr = quantizedBvhDoubleData.m_subTreeInfoPtr; + for (int i=0;i<numElem;i++,memPtr++) + { + m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ; + m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; + m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; + m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; + m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1]; + m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2]; + m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex; + m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize; + } + } + } + +} + + + +///fills the dataBuffer and returns the struct name (and 0 on failure) +const char* b3QuantizedBvh::serialize(void* dataBuffer, b3Serializer* serializer) const +{ + b3Assert(0); + return 0; +} + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h new file mode 100644 index 0000000000..63c523c758 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h @@ -0,0 +1,556 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_QUANTIZED_BVH_H +#define B3_QUANTIZED_BVH_H + +class b3Serializer; + +//#define DEBUG_CHECK_DEQUANTIZATION 1 +#ifdef DEBUG_CHECK_DEQUANTIZATION +#ifdef __SPU__ +#define printf spu_printf +#endif //__SPU__ + +#include <stdio.h> +#include <stdlib.h> +#endif //DEBUG_CHECK_DEQUANTIZATION + +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3Common/b3AlignedAllocator.h" + +#ifdef B3_USE_DOUBLE_PRECISION +#define b3QuantizedBvhData b3QuantizedBvhDoubleData +#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData +#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData" +#else +#define b3QuantizedBvhData b3QuantizedBvhFloatData +#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData +#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData" +#endif + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h" + + + +//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp + + +//Note: currently we have 16 bytes per quantized node +#define MAX_SUBTREE_SIZE_IN_BYTES 2048 + +// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one +// actually) triangles each (since the sign bit is reserved +#define MAX_NUM_PARTS_IN_BITS 10 + +///b3QuantizedBvhNode is a compressed aabb node, 16 bytes. +///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range). +B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); + + bool isLeafNode() const + { + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (m_escapeIndexOrTriangleIndex >= 0); + } + int getEscapeIndex() const + { + b3Assert(!isLeafNode()); + return -m_escapeIndexOrTriangleIndex; + } + int getTriangleIndex() const + { + b3Assert(isLeafNode()); + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (m_escapeIndexOrTriangleIndex&~(y)); + } + int getPartId() const + { + b3Assert(isLeafNode()); + // Get only the highest bits where the part index is stored + return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS)); + } +} +; + +/// b3OptimizedBvhNode contains both internal and leaf node information. +/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes. +B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); + + //32 bytes + b3Vector3 m_aabbMinOrg; + b3Vector3 m_aabbMaxOrg; + + //4 + int m_escapeIndex; + + //8 + //for child nodes + int m_subPart; + int m_triangleIndex; + +//pad the size to 64 bytes + char m_padding[20]; +}; + + +///b3BvhSubtreeInfo provides info to gather a subtree of limited size +B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData +{ +public: + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3BvhSubtreeInfo() + { + //memset(&m_padding[0], 0, sizeof(m_padding)); + } + + + void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode) + { + m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0]; + m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1]; + m_quantizedAabbMin[2] = quantizedNode.m_quantizedAabbMin[2]; + m_quantizedAabbMax[0] = quantizedNode.m_quantizedAabbMax[0]; + m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1]; + m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2]; + } +} +; + + +class b3NodeOverlapCallback +{ +public: + virtual ~b3NodeOverlapCallback() {}; + + virtual void processNode(int subPart, int triangleIndex) = 0; +}; + +#include "Bullet3Common/b3AlignedAllocator.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + + + +///for code readability: +typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray; +typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray; +typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray; + + +///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU. +///It is used by the b3BvhTriangleMeshShape as midphase +///It is recommended to use quantization for better performance and lower memory requirements. +B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh +{ +public: + enum b3TraversalMode + { + TRAVERSAL_STACKLESS = 0, + TRAVERSAL_STACKLESS_CACHE_FRIENDLY, + TRAVERSAL_RECURSIVE + }; + + + + + b3Vector3 m_bvhAabbMin; + b3Vector3 m_bvhAabbMax; + b3Vector3 m_bvhQuantization; + +protected: + int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess. + + int m_curNodeIndex; + //quantization data + bool m_useQuantization; + + + + NodeArray m_leafNodes; + NodeArray m_contiguousNodes; + QuantizedNodeArray m_quantizedLeafNodes; + QuantizedNodeArray m_quantizedContiguousNodes; + + b3TraversalMode m_traversalMode; + BvhSubtreeInfoArray m_SubtreeHeaders; + + //This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray + mutable int m_subtreeHeaderCount; + + + + + + ///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!) + ///this might be refactored into a virtual, it is usually not calculated at run-time + void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin) + { + if (m_useQuantization) + { + quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0); + } else + { + m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin; + + } + } + void setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax) + { + if (m_useQuantization) + { + quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1); + } else + { + m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax; + } + } + + b3Vector3 getAabbMin(int nodeIndex) const + { + if (m_useQuantization) + { + return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]); + } + //non-quantized + return m_leafNodes[nodeIndex].m_aabbMinOrg; + + } + b3Vector3 getAabbMax(int nodeIndex) const + { + if (m_useQuantization) + { + return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]); + } + //non-quantized + return m_leafNodes[nodeIndex].m_aabbMaxOrg; + + } + + + void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex) + { + if (m_useQuantization) + { + m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex; + } + else + { + m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex; + } + + } + + void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax) + { + if (m_useQuantization) + { + unsigned short int quantizedAabbMin[3]; + unsigned short int quantizedAabbMax[3]; + quantize(quantizedAabbMin,newAabbMin,0); + quantize(quantizedAabbMax,newAabbMax,1); + for (int i=0;i<3;i++) + { + if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i]) + m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i]; + + if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i]) + m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i]; + + } + } else + { + //non-quantized + m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin); + m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax); + } + } + + void swapLeafNodes(int firstIndex,int secondIndex); + + void assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex); + +protected: + + + + void buildTree (int startIndex,int endIndex); + + int calcSplittingAxis(int startIndex,int endIndex); + + int sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis); + + void walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + + void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const; + void walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const; + void walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const; + + ///tree traversal designed for small-memory processors like PS3 SPU + void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const; + + ///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal + void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const; + + ///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal + void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const; + + + + + void updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex); + +public: + + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3QuantizedBvh(); + + virtual ~b3QuantizedBvh(); + + + ///***************************************** expert/internal use only ************************* + void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0)); + QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; } + ///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized + void buildInternal(); + ///***************************************** expert/internal use only ************************* + + void reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + void reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const; + void reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + + B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const + { + + b3Assert(m_useQuantization); + + b3Assert(point.getX() <= m_bvhAabbMax.getX()); + b3Assert(point.getY() <= m_bvhAabbMax.getY()); + b3Assert(point.getZ() <= m_bvhAabbMax.getZ()); + + b3Assert(point.getX() >= m_bvhAabbMin.getX()); + b3Assert(point.getY() >= m_bvhAabbMin.getY()); + b3Assert(point.getZ() >= m_bvhAabbMin.getZ()); + + b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization; + ///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative + ///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly) + ///@todo: double-check this + if (isMax) + { + out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1)); + out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1)); + out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1)); + } else + { + out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe)); + out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe)); + out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe)); + } + + +#ifdef DEBUG_CHECK_DEQUANTIZATION + b3Vector3 newPoint = unQuantize(out); + if (isMax) + { + if (newPoint.getX() < point.getX()) + { + printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX()); + } + if (newPoint.getY() < point.getY()) + { + printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY()); + } + if (newPoint.getZ() < point.getZ()) + { + + printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ()); + } + } else + { + if (newPoint.getX() > point.getX()) + { + printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX()); + } + if (newPoint.getY() > point.getY()) + { + printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY()); + } + if (newPoint.getZ() > point.getZ()) + { + printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ()); + } + } +#endif //DEBUG_CHECK_DEQUANTIZATION + + } + + + B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const + { + + b3Assert(m_useQuantization); + + b3Vector3 clampedPoint(point2); + clampedPoint.setMax(m_bvhAabbMin); + clampedPoint.setMin(m_bvhAabbMax); + + quantize(out,clampedPoint,isMax); + + } + + B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const + { + b3Vector3 vecOut; + vecOut.setValue( + (b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()), + (b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()), + (b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ())); + vecOut += m_bvhAabbMin; + return vecOut; + } + + ///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees. + void setTraversalMode(b3TraversalMode traversalMode) + { + m_traversalMode = traversalMode; + } + + + B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray() + { + return m_quantizedContiguousNodes; + } + + + B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray() + { + return m_SubtreeHeaders; + } + +//////////////////////////////////////////////////////////////////// + + /////Calculate space needed to store BVH for serialization + unsigned calculateSerializeBufferSize() const; + + /// Data buffer MUST be 16 byte aligned + virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const; + + ///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' + static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); + + static unsigned int getAlignmentSerializationPadding(); +////////////////////////////////////////////////////////////////////// + + + virtual int calculateSerializeBufferSizeNew() const; + + ///fills the dataBuffer and returns the struct name (and 0 on failure) + virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; + + virtual void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData); + + virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData); + + +//////////////////////////////////////////////////////////////////// + + B3_FORCE_INLINE bool isQuantized() + { + return m_useQuantization; + } + +private: + // Special "copy" constructor that allows for in-place deserialization + // Prevents b3Vector3's default constructor from being called, but doesn't inialize much else + // ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need) + b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory); + +} +; + + +struct b3OptimizedBvhNodeFloatData +{ + b3Vector3FloatData m_aabbMinOrg; + b3Vector3FloatData m_aabbMaxOrg; + int m_escapeIndex; + int m_subPart; + int m_triangleIndex; + char m_pad[4]; +}; + +struct b3OptimizedBvhNodeDoubleData +{ + b3Vector3DoubleData m_aabbMinOrg; + b3Vector3DoubleData m_aabbMaxOrg; + int m_escapeIndex; + int m_subPart; + int m_triangleIndex; + char m_pad[4]; +}; + + + +struct b3QuantizedBvhFloatData +{ + b3Vector3FloatData m_bvhAabbMin; + b3Vector3FloatData m_bvhAabbMax; + b3Vector3FloatData m_bvhQuantization; + int m_curNodeIndex; + int m_useQuantization; + int m_numContiguousLeafNodes; + int m_numQuantizedContiguousNodes; + b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr; + b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr; + b3BvhSubtreeInfoData *m_subTreeInfoPtr; + int m_traversalMode; + int m_numSubtreeHeaders; + +}; + +struct b3QuantizedBvhDoubleData +{ + b3Vector3DoubleData m_bvhAabbMin; + b3Vector3DoubleData m_bvhAabbMax; + b3Vector3DoubleData m_bvhQuantization; + int m_curNodeIndex; + int m_useQuantization; + int m_numContiguousLeafNodes; + int m_numQuantizedContiguousNodes; + b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr; + b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr; + + int m_traversalMode; + int m_numSubtreeHeaders; + b3BvhSubtreeInfoData *m_subTreeInfoPtr; +}; + + +B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const +{ + return sizeof(b3QuantizedBvhData); +} + + + +#endif //B3_QUANTIZED_BVH_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp new file mode 100644 index 0000000000..4d97f7f62b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp @@ -0,0 +1,214 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "b3StridingMeshInterface.h" + + +b3StridingMeshInterface::~b3StridingMeshInterface() +{ + +} + + +void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +{ + (void)aabbMin; + (void)aabbMax; + int numtotalphysicsverts = 0; + int part,graphicssubparts = getNumSubParts(); + const unsigned char * vertexbase; + const unsigned char * indexbase; + int indexstride; + PHY_ScalarType type; + PHY_ScalarType gfxindextype; + int stride,numverts,numtriangles; + int gfxindex; + b3Vector3 triangle[3]; + + b3Vector3 meshScaling = getScaling(); + + ///if the number of parts is big, the performance might drop due to the innerloop switch on indextype + for (part=0;part<graphicssubparts ;part++) + { + getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part); + numtotalphysicsverts+=numtriangles*3; //upper bound + + ///unlike that developers want to pass in double-precision meshes in single-precision Bullet build + ///so disable this feature by default + ///see patch http://code.google.com/p/bullet/issues/detail?id=213 + + switch (type) + { + case PHY_FLOAT: + { + + float* graphicsbase; + + switch (gfxindextype) + { + case PHY_INTEGER: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride); + graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + case PHY_SHORT: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride); + graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + case PHY_UCHAR: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride); + graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + default: + b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); + } + break; + } + + case PHY_DOUBLE: + { + double* graphicsbase; + + switch (gfxindextype) + { + case PHY_INTEGER: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride); + graphicsbase = (double*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + case PHY_SHORT: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride); + graphicsbase = (double*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + case PHY_UCHAR: + { + for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + { + unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride); + graphicsbase = (double*)(vertexbase+tri_indices[0]*stride); + triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); + triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); + triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle,part,gfxindex); + } + break; + } + default: + b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); + } + break; + } + default: + b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE)); + } + + unLockReadOnlyVertexBase(part); + } +} + +void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax) +{ + + struct AabbCalculationCallback : public b3InternalTriangleIndexCallback + { + b3Vector3 m_aabbMin; + b3Vector3 m_aabbMax; + + AabbCalculationCallback() + { + m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + } + + virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + { + (void)partId; + (void)triangleIndex; + + m_aabbMin.setMin(triangle[0]); + m_aabbMax.setMax(triangle[0]); + m_aabbMin.setMin(triangle[1]); + m_aabbMax.setMax(triangle[1]); + m_aabbMin.setMin(triangle[2]); + m_aabbMax.setMax(triangle[2]); + } + }; + + //first calculate the total aabb for all triangles + AabbCalculationCallback aabbCallback; + aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax); + + aabbMin = aabbCallback.m_aabbMin; + aabbMax = aabbCallback.m_aabbMax; +} + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h new file mode 100644 index 0000000000..9513f68f77 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h @@ -0,0 +1,167 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_STRIDING_MESHINTERFACE_H +#define B3_STRIDING_MESHINTERFACE_H + +#include "Bullet3Common/b3Vector3.h" +#include "b3TriangleCallback.h" +//#include "b3ConcaveShape.h" + + +enum PHY_ScalarType { + PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT, + PHY_FIXEDPOINT88, PHY_UCHAR +}; + + +/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes. +/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips. +/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory. +B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface +{ + protected: + + b3Vector3 m_scaling; + + public: + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.))) + { + + } + + virtual ~b3StridingMeshInterface(); + + + + virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + + ///brute force method to calculate aabb + void calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax); + + /// get read and write access to a subpart of a triangle mesh + /// this subpart has a continuous array of vertices and indices + /// in this way the mesh can be handled as chunks of memory with striding + /// very similar to OpenGL vertexarray support + /// make a call to unLockVertexBase when the read and write access is finished + virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0; + + virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0; + + /// unLockVertexBase finishes the access to a subpart of the triangle mesh + /// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished + virtual void unLockVertexBase(int subpart)=0; + + virtual void unLockReadOnlyVertexBase(int subpart) const=0; + + + /// getNumSubParts returns the number of seperate subparts + /// each subpart has a continuous array of vertices and indices + virtual int getNumSubParts() const=0; + + virtual void preallocateVertices(int numverts)=0; + virtual void preallocateIndices(int numindices)=0; + + virtual bool hasPremadeAabb() const { return false; } + virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const + { + (void) aabbMin; + (void) aabbMax; + } + virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const + { + (void) aabbMin; + (void) aabbMax; + } + + const b3Vector3& getScaling() const { + return m_scaling; + } + void setScaling(const b3Vector3& scaling) + { + m_scaling = scaling; + } + + virtual int calculateSerializeBufferSize() const; + + ///fills the dataBuffer and returns the struct name (and 0 on failure) + //virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; + + +}; + +struct b3IntIndexData +{ + int m_value; +}; + +struct b3ShortIntIndexData +{ + short m_value; + char m_pad[2]; +}; + +struct b3ShortIntIndexTripletData +{ + short m_values[3]; + char m_pad[2]; +}; + +struct b3CharIndexTripletData +{ + unsigned char m_values[3]; + char m_pad; +}; + + +///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64 +struct b3MeshPartData +{ + b3Vector3FloatData *m_vertices3f; + b3Vector3DoubleData *m_vertices3d; + + b3IntIndexData *m_indices32; + b3ShortIntIndexTripletData *m_3indices16; + b3CharIndexTripletData *m_3indices8; + + b3ShortIntIndexData *m_indices16;//backwards compatibility + + int m_numTriangles;//length of m_indices = m_numTriangles + int m_numVertices; +}; + + +///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64 +struct b3StridingMeshInterfaceData +{ + b3MeshPartData *m_meshPartsPtr; + b3Vector3FloatData m_scaling; + int m_numMeshParts; + char m_padding[4]; +}; + + + + +B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const +{ + return sizeof(b3StridingMeshInterfaceData); +} + + + +#endif //B3_STRIDING_MESHINTERFACE_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h new file mode 100644 index 0000000000..d073ee57c3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h @@ -0,0 +1,38 @@ + +#ifndef B3_SUPPORT_MAPPINGS_H +#define B3_SUPPORT_MAPPINGS_H + +#include "Bullet3Common/b3Transform.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "b3VectorFloat4.h" + + +struct b3GjkPairDetector; + + + +inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, + const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin) +{ + b3Vector3 supVec = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + b3Scalar maxDot = b3Scalar(-B3_LARGE_FLOAT); + + // Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically. + if( 0 < hull->m_numVertices ) + { + const b3Vector3 scaled = supportVec; + int index = (int) scaled.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot); + return verticesA[hull->m_vertexOffset+index]; + } + + return supVec; + +} + +inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, + const b3AlignedObjectArray<b3Vector3>& verticesA) +{ + return localGetSupportVertexWithMargin(supportVec,hull,verticesA,0.f); +} + +#endif //B3_SUPPORT_MAPPINGS_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp new file mode 100644 index 0000000000..9066451884 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp @@ -0,0 +1,28 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "b3TriangleCallback.h" + +b3TriangleCallback::~b3TriangleCallback() +{ + +} + + +b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback() +{ + +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h new file mode 100644 index 0000000000..3059fa4f21 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h @@ -0,0 +1,42 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_TRIANGLE_CALLBACK_H +#define B3_TRIANGLE_CALLBACK_H + +#include "Bullet3Common/b3Vector3.h" + + +///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles. +///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape. +class b3TriangleCallback +{ +public: + + virtual ~b3TriangleCallback(); + virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0; +}; + +class b3InternalTriangleIndexCallback +{ +public: + + virtual ~b3InternalTriangleIndexCallback(); + virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) = 0; +}; + + + +#endif //B3_TRIANGLE_CALLBACK_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp new file mode 100644 index 0000000000..a0f59babbe --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp @@ -0,0 +1,95 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "b3TriangleIndexVertexArray.h" + +b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride) +: m_hasAabb(0) +{ + b3IndexedMesh mesh; + + mesh.m_numTriangles = numTriangles; + mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase; + mesh.m_triangleIndexStride = triangleIndexStride; + mesh.m_numVertices = numVertices; + mesh.m_vertexBase = (const unsigned char *)vertexBase; + mesh.m_vertexStride = vertexStride; + + addIndexedMesh(mesh); + +} + +b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray() +{ + +} + +void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) +{ + b3Assert(subpart< getNumSubParts() ); + + b3IndexedMesh& mesh = m_indexedMeshes[subpart]; + + numverts = mesh.m_numVertices; + (*vertexbase) = (unsigned char *) mesh.m_vertexBase; + + type = mesh.m_vertexType; + + vertexStride = mesh.m_vertexStride; + + numfaces = mesh.m_numTriangles; + + (*indexbase) = (unsigned char *)mesh.m_triangleIndexBase; + indexstride = mesh.m_triangleIndexStride; + indicestype = mesh.m_indexType; +} + +void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const +{ + const b3IndexedMesh& mesh = m_indexedMeshes[subpart]; + + numverts = mesh.m_numVertices; + (*vertexbase) = (const unsigned char *)mesh.m_vertexBase; + + type = mesh.m_vertexType; + + vertexStride = mesh.m_vertexStride; + + numfaces = mesh.m_numTriangles; + (*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase; + indexstride = mesh.m_triangleIndexStride; + indicestype = mesh.m_indexType; +} + +bool b3TriangleIndexVertexArray::hasPremadeAabb() const +{ + return (m_hasAabb == 1); +} + + +void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const +{ + m_aabbMin = aabbMin; + m_aabbMax = aabbMax; + m_hasAabb = 1; // this is intentionally an int see notes in header +} + +void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const +{ + *aabbMin = m_aabbMin; + *aabbMax = m_aabbMax; +} + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h new file mode 100644 index 0000000000..d26b2893bc --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h @@ -0,0 +1,133 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef B3_TRIANGLE_INDEX_VERTEX_ARRAY_H +#define B3_TRIANGLE_INDEX_VERTEX_ARRAY_H + +#include "b3StridingMeshInterface.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Scalar.h" + + +///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh. +///Instead of the number of indices, we pass the number of triangles. +B3_ATTRIBUTE_ALIGNED16( struct) b3IndexedMesh +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); + + int m_numTriangles; + const unsigned char * m_triangleIndexBase; + // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed) + int m_triangleIndexStride; + int m_numVertices; + const unsigned char * m_vertexBase; + // Size of a vertex, in bytes + int m_vertexStride; + + // The index type is set when adding an indexed mesh to the + // b3TriangleIndexVertexArray, do not set it manually + PHY_ScalarType m_indexType; + + // The vertex type has a default type similar to Bullet's precision mode (float or double) + // but can be set manually if you for example run Bullet with double precision but have + // mesh data in single precision.. + PHY_ScalarType m_vertexType; + + + b3IndexedMesh() + :m_indexType(PHY_INTEGER), +#ifdef B3_USE_DOUBLE_PRECISION + m_vertexType(PHY_DOUBLE) +#else // B3_USE_DOUBLE_PRECISION + m_vertexType(PHY_FLOAT) +#endif // B3_USE_DOUBLE_PRECISION + { + } +} +; + + +typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray; + +///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays. +///Additional meshes can be added using addIndexedMesh +///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays. +///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray. +B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface +{ +protected: + IndexedMeshArray m_indexedMeshes; + int m_pad[2]; + mutable int m_hasAabb; // using int instead of bool to maintain alignment + mutable b3Vector3 m_aabbMin; + mutable b3Vector3 m_aabbMax; + +public: + + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3TriangleIndexVertexArray() : m_hasAabb(0) + { + } + + virtual ~b3TriangleIndexVertexArray(); + + //just to be backwards compatible + b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride); + + void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER) + { + m_indexedMeshes.push_back(mesh); + m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType; + } + + + virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0); + + virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const; + + /// unLockVertexBase finishes the access to a subpart of the triangle mesh + /// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished + virtual void unLockVertexBase(int subpart) {(void)subpart;} + + virtual void unLockReadOnlyVertexBase(int subpart) const {(void)subpart;} + + /// getNumSubParts returns the number of seperate subparts + /// each subpart has a continuous array of vertices and indices + virtual int getNumSubParts() const { + return (int)m_indexedMeshes.size(); + } + + IndexedMeshArray& getIndexedMeshArray() + { + return m_indexedMeshes; + } + + const IndexedMeshArray& getIndexedMeshArray() const + { + return m_indexedMeshes; + } + + virtual void preallocateVertices(int numverts){(void) numverts;} + virtual void preallocateIndices(int numindices){(void) numindices;} + + virtual bool hasPremadeAabb() const; + virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const; + virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const; + +} +; + +#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h new file mode 100644 index 0000000000..f6f65f7719 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h @@ -0,0 +1,11 @@ +#ifndef B3_VECTOR_FLOAT4_H +#define B3_VECTOR_FLOAT4_H + +#include "Bullet3Common/b3Transform.h" + +//#define cross3(a,b) (a.cross(b)) +#define float4 b3Vector3 +//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w) + + +#endif //B3_VECTOR_FLOAT4_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp new file mode 100644 index 0000000000..cf3d5ef49d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp @@ -0,0 +1,609 @@ + +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. + + Elsevier CDROM license agreements grants nonexclusive license to use the software + for any purpose, commercial or non-commercial as long as the following credit is included + identifying the original source of the software: + + Parts of the source are "from the book Real-Time Collision Detection by + Christer Ericson, published by Morgan Kaufmann Publishers, + (c) 2005 Elsevier Inc." + +*/ + + +#include "b3VoronoiSimplexSolver.h" + +#define VERTA 0 +#define VERTB 1 +#define VERTC 2 +#define VERTD 3 + +#define B3_CATCH_DEGENERATE_TETRAHEDRON 1 +void b3VoronoiSimplexSolver::removeVertex(int index) +{ + + b3Assert(m_numVertices>0); + m_numVertices--; + m_simplexVectorW[index] = m_simplexVectorW[m_numVertices]; + m_simplexPointsP[index] = m_simplexPointsP[m_numVertices]; + m_simplexPointsQ[index] = m_simplexPointsQ[m_numVertices]; +} + +void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts) +{ + if ((numVertices() >= 4) && (!usedVerts.usedVertexD)) + removeVertex(3); + + if ((numVertices() >= 3) && (!usedVerts.usedVertexC)) + removeVertex(2); + + if ((numVertices() >= 2) && (!usedVerts.usedVertexB)) + removeVertex(1); + + if ((numVertices() >= 1) && (!usedVerts.usedVertexA)) + removeVertex(0); + +} + + + + + +//clear the simplex, remove all the vertices +void b3VoronoiSimplexSolver::reset() +{ + m_cachedValidClosest = false; + m_numVertices = 0; + m_needsUpdate = true; + m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + m_cachedBC.reset(); +} + + + + //add a vertex +void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q) +{ + m_lastW = w; + m_needsUpdate = true; + + m_simplexVectorW[m_numVertices] = w; + m_simplexPointsP[m_numVertices] = p; + m_simplexPointsQ[m_numVertices] = q; + + m_numVertices++; +} + +bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() +{ + + if (m_needsUpdate) + { + m_cachedBC.reset(); + + m_needsUpdate = false; + + switch (numVertices()) + { + case 0: + m_cachedValidClosest = false; + break; + case 1: + { + m_cachedP1 = m_simplexPointsP[0]; + m_cachedP2 = m_simplexPointsQ[0]; + m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0] + m_cachedBC.reset(); + m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + m_cachedValidClosest = m_cachedBC.isValid(); + break; + }; + case 2: + { + //closest point origin from line segment + const b3Vector3& from = m_simplexVectorW[0]; + const b3Vector3& to = m_simplexVectorW[1]; + b3Vector3 nearest; + + b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + b3Vector3 diff = p - from; + b3Vector3 v = to - from; + b3Scalar t = v.dot(diff); + + if (t > 0) { + b3Scalar dotVV = v.dot(v); + if (t < dotVV) { + t /= dotVV; + diff -= t*v; + m_cachedBC.m_usedVertices.usedVertexA = true; + m_cachedBC.m_usedVertices.usedVertexB = true; + } else { + t = 1; + diff -= v; + //reduce to 1 point + m_cachedBC.m_usedVertices.usedVertexB = true; + } + } else + { + t = 0; + //reduce to 1 point + m_cachedBC.m_usedVertices.usedVertexA = true; + } + m_cachedBC.setBarycentricCoordinates(1-t,t); + nearest = from + t*v; + + m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]); + m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]); + m_cachedV = m_cachedP1 - m_cachedP2; + + reduceVertices(m_cachedBC.m_usedVertices); + + m_cachedValidClosest = m_cachedBC.isValid(); + break; + } + case 3: + { + //closest point origin from triangle + b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + + const b3Vector3& a = m_simplexVectorW[0]; + const b3Vector3& b = m_simplexVectorW[1]; + const b3Vector3& c = m_simplexVectorW[2]; + + closestPtPointTriangle(p,a,b,c,m_cachedBC); + m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2]; + + m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2]; + + m_cachedV = m_cachedP1-m_cachedP2; + + reduceVertices (m_cachedBC.m_usedVertices); + m_cachedValidClosest = m_cachedBC.isValid(); + + break; + } + case 4: + { + + + b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + + const b3Vector3& a = m_simplexVectorW[0]; + const b3Vector3& b = m_simplexVectorW[1]; + const b3Vector3& c = m_simplexVectorW[2]; + const b3Vector3& d = m_simplexVectorW[3]; + + bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC); + + if (hasSeperation) + { + + m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] + + m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3]; + + m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] + + m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3]; + + m_cachedV = m_cachedP1-m_cachedP2; + reduceVertices (m_cachedBC.m_usedVertices); + } else + { +// printf("sub distance got penetration\n"); + + if (m_cachedBC.m_degenerate) + { + m_cachedValidClosest = false; + } else + { + m_cachedValidClosest = true; + //degenerate case == false, penetration = true + zero + m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + } + break; + } + + m_cachedValidClosest = m_cachedBC.isValid(); + + //closest point origin from tetrahedron + break; + } + default: + { + m_cachedValidClosest = false; + } + }; + } + + return m_cachedValidClosest; + +} + +//return/calculate the closest vertex +bool b3VoronoiSimplexSolver::closest(b3Vector3& v) +{ + bool succes = updateClosestVectorAndPoints(); + v = m_cachedV; + return succes; +} + + + +b3Scalar b3VoronoiSimplexSolver::maxVertex() +{ + int i, numverts = numVertices(); + b3Scalar maxV = b3Scalar(0.); + for (i=0;i<numverts;i++) + { + b3Scalar curLen2 = m_simplexVectorW[i].length2(); + if (maxV < curLen2) + maxV = curLen2; + } + return maxV; +} + + + + //return the current simplex +int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const +{ + int i; + for (i=0;i<numVertices();i++) + { + yBuf[i] = m_simplexVectorW[i]; + pBuf[i] = m_simplexPointsP[i]; + qBuf[i] = m_simplexPointsQ[i]; + } + return numVertices(); +} + + + + +bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w) +{ + bool found = false; + int i, numverts = numVertices(); + //b3Scalar maxV = b3Scalar(0.); + + //w is in the current (reduced) simplex + for (i=0;i<numverts;i++) + { +#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD + if ( m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold) +#else + if (m_simplexVectorW[i] == w) +#endif + found = true; + } + + //check in case lastW is already removed + if (w == m_lastW) + return true; + + return found; +} + +void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v) +{ + v = m_cachedV; +} + + +bool b3VoronoiSimplexSolver::emptySimplex() const +{ + return (numVertices() == 0); + +} + +void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2) +{ + updateClosestVectorAndPoints(); + p1 = m_cachedP1; + p2 = m_cachedP2; + +} + + + + +bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result) +{ + result.m_usedVertices.reset(); + + // Check if P in vertex region outside A + b3Vector3 ab = b - a; + b3Vector3 ac = c - a; + b3Vector3 ap = p - a; + b3Scalar d1 = ab.dot(ap); + b3Scalar d2 = ac.dot(ap); + if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0)) + { + result.m_closestPointOnSimplex = a; + result.m_usedVertices.usedVertexA = true; + result.setBarycentricCoordinates(1,0,0); + return true;// a; // barycentric coordinates (1,0,0) + } + + // Check if P in vertex region outside B + b3Vector3 bp = p - b; + b3Scalar d3 = ab.dot(bp); + b3Scalar d4 = ac.dot(bp); + if (d3 >= b3Scalar(0.0) && d4 <= d3) + { + result.m_closestPointOnSimplex = b; + result.m_usedVertices.usedVertexB = true; + result.setBarycentricCoordinates(0,1,0); + + return true; // b; // barycentric coordinates (0,1,0) + } + // Check if P in edge region of AB, if so return projection of P onto AB + b3Scalar vc = d1*d4 - d3*d2; + if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) { + b3Scalar v = d1 / (d1 - d3); + result.m_closestPointOnSimplex = a + v * ab; + result.m_usedVertices.usedVertexA = true; + result.m_usedVertices.usedVertexB = true; + result.setBarycentricCoordinates(1-v,v,0); + return true; + //return a + v * ab; // barycentric coordinates (1-v,v,0) + } + + // Check if P in vertex region outside C + b3Vector3 cp = p - c; + b3Scalar d5 = ab.dot(cp); + b3Scalar d6 = ac.dot(cp); + if (d6 >= b3Scalar(0.0) && d5 <= d6) + { + result.m_closestPointOnSimplex = c; + result.m_usedVertices.usedVertexC = true; + result.setBarycentricCoordinates(0,0,1); + return true;//c; // barycentric coordinates (0,0,1) + } + + // Check if P in edge region of AC, if so return projection of P onto AC + b3Scalar vb = d5*d2 - d1*d6; + if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) { + b3Scalar w = d2 / (d2 - d6); + result.m_closestPointOnSimplex = a + w * ac; + result.m_usedVertices.usedVertexA = true; + result.m_usedVertices.usedVertexC = true; + result.setBarycentricCoordinates(1-w,0,w); + return true; + //return a + w * ac; // barycentric coordinates (1-w,0,w) + } + + // Check if P in edge region of BC, if so return projection of P onto BC + b3Scalar va = d3*d6 - d5*d4; + if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) { + b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6)); + + result.m_closestPointOnSimplex = b + w * (c - b); + result.m_usedVertices.usedVertexB = true; + result.m_usedVertices.usedVertexC = true; + result.setBarycentricCoordinates(0,1-w,w); + return true; + // return b + w * (c - b); // barycentric coordinates (0,1-w,w) + } + + // P inside face region. Compute Q through its barycentric coordinates (u,v,w) + b3Scalar denom = b3Scalar(1.0) / (va + vb + vc); + b3Scalar v = vb * denom; + b3Scalar w = vc * denom; + + result.m_closestPointOnSimplex = a + ab * v + ac * w; + result.m_usedVertices.usedVertexA = true; + result.m_usedVertices.usedVertexB = true; + result.m_usedVertices.usedVertexC = true; + result.setBarycentricCoordinates(1-v-w,v,w); + + return true; +// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w + +} + + + + + +/// Test if point p and d lie on opposite sides of plane through abc +int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d) +{ + b3Vector3 normal = (b-a).cross(c-a); + + b3Scalar signp = (p - a).dot(normal); // [AP AB AC] + b3Scalar signd = (d - a).dot( normal); // [AD AB AC] + +#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON +#ifdef BT_USE_DOUBLE_PRECISION +if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8))) + { + return -1; + } +#else + if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4))) + { +// printf("affine dependent/degenerate\n");// + return -1; + } +#endif + +#endif + // Points on opposite sides if expression signs are opposite + return signp * signd < b3Scalar(0.); +} + + +bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult) +{ + b3SubSimplexClosestResult tempResult; + + // Start out assuming point inside all halfspaces, so closest to itself + finalResult.m_closestPointOnSimplex = p; + finalResult.m_usedVertices.reset(); + finalResult.m_usedVertices.usedVertexA = true; + finalResult.m_usedVertices.usedVertexB = true; + finalResult.m_usedVertices.usedVertexC = true; + finalResult.m_usedVertices.usedVertexD = true; + + int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d); + int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b); + int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c); + int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a); + + if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0) + { + finalResult.m_degenerate = true; + return false; + } + + if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC) + { + return false; + } + + + b3Scalar bestSqDist = FLT_MAX; + // If point outside face abc then compute closest point on abc + if (pointOutsideABC) + { + closestPtPointTriangle(p, a, b, c,tempResult); + b3Vector3 q = tempResult.m_closestPointOnSimplex; + + b3Scalar sqDist = (q - p).dot( q - p); + // Update best closest point if (squared) distance is less than current best + if (sqDist < bestSqDist) { + bestSqDist = sqDist; + finalResult.m_closestPointOnSimplex = q; + //convert result bitmask! + finalResult.m_usedVertices.reset(); + finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA; + finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB; + finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC; + finalResult.setBarycentricCoordinates( + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTB], + tempResult.m_barycentricCoords[VERTC], + 0 + ); + + } + } + + + // Repeat test for face acd + if (pointOutsideACD) + { + closestPtPointTriangle(p, a, c, d,tempResult); + b3Vector3 q = tempResult.m_closestPointOnSimplex; + //convert result bitmask! + + b3Scalar sqDist = (q - p).dot( q - p); + if (sqDist < bestSqDist) + { + bestSqDist = sqDist; + finalResult.m_closestPointOnSimplex = q; + finalResult.m_usedVertices.reset(); + finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA; + + finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB; + finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC; + finalResult.setBarycentricCoordinates( + tempResult.m_barycentricCoords[VERTA], + 0, + tempResult.m_barycentricCoords[VERTB], + tempResult.m_barycentricCoords[VERTC] + ); + + } + } + // Repeat test for face adb + + + if (pointOutsideADB) + { + closestPtPointTriangle(p, a, d, b,tempResult); + b3Vector3 q = tempResult.m_closestPointOnSimplex; + //convert result bitmask! + + b3Scalar sqDist = (q - p).dot( q - p); + if (sqDist < bestSqDist) + { + bestSqDist = sqDist; + finalResult.m_closestPointOnSimplex = q; + finalResult.m_usedVertices.reset(); + finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA; + finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC; + + finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB; + finalResult.setBarycentricCoordinates( + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTC], + 0, + tempResult.m_barycentricCoords[VERTB] + ); + + } + } + // Repeat test for face bdc + + + if (pointOutsideBDC) + { + closestPtPointTriangle(p, b, d, c,tempResult); + b3Vector3 q = tempResult.m_closestPointOnSimplex; + //convert result bitmask! + b3Scalar sqDist = (q - p).dot( q - p); + if (sqDist < bestSqDist) + { + bestSqDist = sqDist; + finalResult.m_closestPointOnSimplex = q; + finalResult.m_usedVertices.reset(); + // + finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA; + finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC; + finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB; + + finalResult.setBarycentricCoordinates( + 0, + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTC], + tempResult.m_barycentricCoords[VERTB] + ); + + } + } + + //help! we ended up full ! + + if (finalResult.m_usedVertices.usedVertexA && + finalResult.m_usedVertices.usedVertexB && + finalResult.m_usedVertices.usedVertexC && + finalResult.m_usedVertices.usedVertexD) + { + return true; + } + + return true; +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h new file mode 100644 index 0000000000..a6e27667d8 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h @@ -0,0 +1,177 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + + +#ifndef B3_VORONOI_SIMPLEX_SOLVER_H +#define B3_VORONOI_SIMPLEX_SOLVER_H + +#include "Bullet3Common/b3Vector3.h" + + +#define VORONOI_SIMPLEX_MAX_VERTS 5 + +///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure +//#define BT_USE_EQUAL_VERTEX_THRESHOLD +#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f + + +struct b3UsageBitfield{ + b3UsageBitfield() + { + reset(); + } + + void reset() + { + usedVertexA = false; + usedVertexB = false; + usedVertexC = false; + usedVertexD = false; + } + unsigned short usedVertexA : 1; + unsigned short usedVertexB : 1; + unsigned short usedVertexC : 1; + unsigned short usedVertexD : 1; + unsigned short unused1 : 1; + unsigned short unused2 : 1; + unsigned short unused3 : 1; + unsigned short unused4 : 1; +}; + + +struct b3SubSimplexClosestResult +{ + b3Vector3 m_closestPointOnSimplex; + //MASK for m_usedVertices + //stores the simplex vertex-usage, using the MASK, + // if m_usedVertices & MASK then the related vertex is used + b3UsageBitfield m_usedVertices; + b3Scalar m_barycentricCoords[4]; + bool m_degenerate; + + void reset() + { + m_degenerate = false; + setBarycentricCoordinates(); + m_usedVertices.reset(); + } + bool isValid() + { + bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) && + (m_barycentricCoords[1] >= b3Scalar(0.)) && + (m_barycentricCoords[2] >= b3Scalar(0.)) && + (m_barycentricCoords[3] >= b3Scalar(0.)); + + + return valid; + } + void setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.)) + { + m_barycentricCoords[0] = a; + m_barycentricCoords[1] = b; + m_barycentricCoords[2] = c; + m_barycentricCoords[3] = d; + } + +}; + +/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin. +/// Can be used with GJK, as an alternative to Johnson distance algorithm. + +B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver +{ +public: + + B3_DECLARE_ALIGNED_ALLOCATOR(); + + int m_numVertices; + + b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS]; + b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS]; + b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS]; + + + + b3Vector3 m_cachedP1; + b3Vector3 m_cachedP2; + b3Vector3 m_cachedV; + b3Vector3 m_lastW; + + b3Scalar m_equalVertexThreshold; + bool m_cachedValidClosest; + + + b3SubSimplexClosestResult m_cachedBC; + + bool m_needsUpdate; + + void removeVertex(int index); + void reduceVertices (const b3UsageBitfield& usedVerts); + bool updateClosestVectorAndPoints(); + + bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult); + int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d); + bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result); + +public: + + b3VoronoiSimplexSolver() + : m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD) + { + } + void reset(); + + void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q); + + void setEqualVertexThreshold(b3Scalar threshold) + { + m_equalVertexThreshold = threshold; + } + + b3Scalar getEqualVertexThreshold() const + { + return m_equalVertexThreshold; + } + + bool closest(b3Vector3& v); + + b3Scalar maxVertex(); + + bool fullSimplex() const + { + return (m_numVertices == 4); + } + + int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const; + + bool inSimplex(const b3Vector3& w); + + void backup_closest(b3Vector3& v) ; + + bool emptySimplex() const ; + + void compute_points(b3Vector3& p1, b3Vector3& p2) ; + + int numVertices() const + { + return m_numVertices; + } + + +}; + +#endif //B3_VORONOI_SIMPLEX_SOLVER_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl new file mode 100644 index 0000000000..faa413441c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl @@ -0,0 +1,283 @@ +//keep this enum in sync with the CPU version (in btCollidable.h) +//written by Erwin Coumans + +#define SHAPE_CONVEX_HULL 3 +#define SHAPE_CONCAVE_TRIMESH 5 +#define TRIANGLE_NUM_CONVEX_FACES 5 +#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 +#define SHAPE_SPHERE 7 + +typedef unsigned int u32; + +#define MAX_NUM_PARTS_IN_BITS 10 + +///btQuantizedBvhNode is a compressed aabb node, 16 bytes. +///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range). +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes + int m_escapeIndexOrTriangleIndex; +} btQuantizedBvhNode; + +typedef struct +{ + float4 m_aabbMin; + float4 m_aabbMax; + float4 m_quantization; + int m_numNodes; + int m_numSubTrees; + int m_nodeOffset; + int m_subTreeOffset; + +} b3BvhInfo; + +int getTriangleIndex(const btQuantizedBvhNode* rootNode) +{ + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); +} + +int isLeaf(const btQuantizedBvhNode* rootNode) +{ + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0; +} + +int getEscapeIndex(const btQuantizedBvhNode* rootNode) +{ + return -rootNode->m_escapeIndexOrTriangleIndex; +} + +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes, points to the root of the subtree + int m_rootNodeIndex; + //4 bytes + int m_subtreeSize; + int m_padding[3]; +} btBvhSubtreeInfo; + +///keep this in sync with btCollidable.h +typedef struct +{ + int m_numChildShapes; + int blaat2; + int m_shapeType; + int m_shapeIndex; + +} btCollidableGpu; + +typedef struct +{ + float4 m_childPosition; + float4 m_childOrientation; + int m_shapeIndex; + int m_unused0; + int m_unused1; + int m_unused2; +} btGpuChildShape; + + +typedef struct +{ + float4 m_pos; + float4 m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_collidableIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} BodyData; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + + +int testQuantizedAabbAgainstQuantizedAabb( + const unsigned short int* aabbMin1, + const unsigned short int* aabbMax1, + const unsigned short int* aabbMin2, + const unsigned short int* aabbMax2) +{ + //int overlap = 1; + if (aabbMin1[0] > aabbMax2[0]) + return 0; + if (aabbMax1[0] < aabbMin2[0]) + return 0; + if (aabbMin1[1] > aabbMax2[1]) + return 0; + if (aabbMax1[1] < aabbMin2[1]) + return 0; + if (aabbMin1[2] > aabbMax2[2]) + return 0; + if (aabbMax1[2] < aabbMin2[2]) + return 0; + return 1; + //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap; + //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap; + //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap; + //return overlap; +} + + +void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization) +{ + float4 clampedPoint = max(point2,bvhAabbMin); + clampedPoint = min (clampedPoint, bvhAabbMax); + + float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization; + if (isMax) + { + out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1)); + out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1)); + out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1)); + } else + { + out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe)); + out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe)); + out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe)); + } + +} + + +// work-in-progress +__kernel void bvhTraversalKernel( __global const int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global btAabbCL* aabbs, + __global int4* concavePairsOut, + __global volatile int* numConcavePairsOut, + __global const btBvhSubtreeInfo* subtreeHeadersRoot, + __global const btQuantizedBvhNode* quantizedNodesRoot, + __global const b3BvhInfo* bvhInfos, + int numPairs, + int maxNumConcavePairsCapacity) +{ + int id = get_global_id(0); + if (id>=numPairs) + return; + + int bodyIndexA = pairs[id].x; + int bodyIndexB = pairs[id].y; + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + return; + } + + if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) + return; + + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + + if (shapeTypeB!=SHAPE_CONVEX_HULL && + shapeTypeB!=SHAPE_SPHERE && + shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS + ) + return; + + b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes]; + + float4 bvhAabbMin = bvhInfo.m_aabbMin; + float4 bvhAabbMax = bvhInfo.m_aabbMax; + float4 bvhQuantization = bvhInfo.m_quantization; + int numSubtreeHeaders = bvhInfo.m_numSubTrees; + __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset]; + __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset]; + + + unsigned short int quantizedQueryAabbMin[3]; + unsigned short int quantizedQueryAabbMax[3]; + quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization); + quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization); + + for (int i=0;i<numSubtreeHeaders;i++) + { + btBvhSubtreeInfo subtree = subtreeHeaders[i]; + + int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); + if (overlap != 0) + { + int startNodeIndex = subtree.m_rootNodeIndex; + int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize; + int curIndex = startNodeIndex; + int escapeIndex; + int isLeafNode; + int aabbOverlap; + while (curIndex < endNodeIndex) + { + btQuantizedBvhNode rootNode = quantizedNodes[curIndex]; + aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax); + isLeafNode = isLeaf(&rootNode); + if (aabbOverlap) + { + if (isLeafNode) + { + int triangleIndex = getTriangleIndex(&rootNode); + if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + int pairIdx = atomic_add(numConcavePairsOut,numChildrenB); + for (int b=0;b<numChildrenB;b++) + { + if ((pairIdx+b)<maxNumConcavePairsCapacity) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB); + concavePairsOut[pairIdx+b] = newPair; + } + } + } else + { + int pairIdx = atomic_inc(numConcavePairsOut); + if (pairIdx<maxNumConcavePairsCapacity) + { + int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0); + concavePairsOut[pairIdx] = newPair; + } + } + } + curIndex++; + } else + { + if (isLeafNode) + { + curIndex++; + } else + { + escapeIndex = getEscapeIndex(&rootNode); + curIndex += escapeIndex; + } + } + } + } + } + +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h new file mode 100644 index 0000000000..4b3b49eae8 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h @@ -0,0 +1,258 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* bvhTraversalKernelCL= \ +"//keep this enum in sync with the CPU version (in btCollidable.h)\n" +"//written by Erwin Coumans\n" +"#define SHAPE_CONVEX_HULL 3\n" +"#define SHAPE_CONCAVE_TRIMESH 5\n" +"#define TRIANGLE_NUM_CONVEX_FACES 5\n" +"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" +"#define SHAPE_SPHERE 7\n" +"typedef unsigned int u32;\n" +"#define MAX_NUM_PARTS_IN_BITS 10\n" +"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n" +"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes\n" +" int m_escapeIndexOrTriangleIndex;\n" +"} btQuantizedBvhNode;\n" +"typedef struct\n" +"{\n" +" float4 m_aabbMin;\n" +" float4 m_aabbMax;\n" +" float4 m_quantization;\n" +" int m_numNodes;\n" +" int m_numSubTrees;\n" +" int m_nodeOffset;\n" +" int m_subTreeOffset;\n" +"} b3BvhInfo;\n" +"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n" +"{\n" +" unsigned int x=0;\n" +" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" +" // Get only the lower bits where the triangle index is stored\n" +" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" +"}\n" +"int isLeaf(const btQuantizedBvhNode* rootNode)\n" +"{\n" +" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" +" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" +"}\n" +" \n" +"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n" +"{\n" +" return -rootNode->m_escapeIndexOrTriangleIndex;\n" +"}\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes, points to the root of the subtree\n" +" int m_rootNodeIndex;\n" +" //4 bytes\n" +" int m_subtreeSize;\n" +" int m_padding[3];\n" +"} btBvhSubtreeInfo;\n" +"///keep this in sync with btCollidable.h\n" +"typedef struct\n" +"{\n" +" int m_numChildShapes;\n" +" int blaat2;\n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +" \n" +"} btCollidableGpu;\n" +"typedef struct\n" +"{\n" +" float4 m_childPosition;\n" +" float4 m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"} btGpuChildShape;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" float4 m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} BodyData;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"int testQuantizedAabbAgainstQuantizedAabb(\n" +" const unsigned short int* aabbMin1,\n" +" const unsigned short int* aabbMax1,\n" +" const unsigned short int* aabbMin2,\n" +" const unsigned short int* aabbMax2)\n" +"{\n" +" //int overlap = 1;\n" +" if (aabbMin1[0] > aabbMax2[0])\n" +" return 0;\n" +" if (aabbMax1[0] < aabbMin2[0])\n" +" return 0;\n" +" if (aabbMin1[1] > aabbMax2[1])\n" +" return 0;\n" +" if (aabbMax1[1] < aabbMin2[1])\n" +" return 0;\n" +" if (aabbMin1[2] > aabbMax2[2])\n" +" return 0;\n" +" if (aabbMax1[2] < aabbMin2[2])\n" +" return 0;\n" +" return 1;\n" +" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n" +" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n" +" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n" +" //return overlap;\n" +"}\n" +"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n" +"{\n" +" float4 clampedPoint = max(point2,bvhAabbMin);\n" +" clampedPoint = min (clampedPoint, bvhAabbMax);\n" +" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n" +" if (isMax)\n" +" {\n" +" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n" +" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n" +" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n" +" } else\n" +" {\n" +" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n" +" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n" +" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n" +" }\n" +"}\n" +"// work-in-progress\n" +"__kernel void bvhTraversalKernel( __global const int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global btAabbCL* aabbs,\n" +" __global int4* concavePairsOut,\n" +" __global volatile int* numConcavePairsOut,\n" +" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n" +" __global const btQuantizedBvhNode* quantizedNodesRoot,\n" +" __global const b3BvhInfo* bvhInfos,\n" +" int numPairs,\n" +" int maxNumConcavePairsCapacity)\n" +"{\n" +" int id = get_global_id(0);\n" +" if (id>=numPairs)\n" +" return;\n" +" \n" +" int bodyIndexA = pairs[id].x;\n" +" int bodyIndexB = pairs[id].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" //once the broadphase avoids static-static pairs, we can remove this test\n" +" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" +" {\n" +" return;\n" +" }\n" +" \n" +" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n" +" return;\n" +" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" +" \n" +" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n" +" shapeTypeB!=SHAPE_SPHERE &&\n" +" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n" +" )\n" +" return;\n" +" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n" +" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n" +" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n" +" float4 bvhQuantization = bvhInfo.m_quantization;\n" +" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n" +" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n" +" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n" +" \n" +" unsigned short int quantizedQueryAabbMin[3];\n" +" unsigned short int quantizedQueryAabbMax[3];\n" +" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" +" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" +" \n" +" for (int i=0;i<numSubtreeHeaders;i++)\n" +" {\n" +" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n" +" \n" +" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n" +" if (overlap != 0)\n" +" {\n" +" int startNodeIndex = subtree.m_rootNodeIndex;\n" +" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n" +" int curIndex = startNodeIndex;\n" +" int escapeIndex;\n" +" int isLeafNode;\n" +" int aabbOverlap;\n" +" while (curIndex < endNodeIndex)\n" +" {\n" +" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n" +" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n" +" isLeafNode = isLeaf(&rootNode);\n" +" if (aabbOverlap)\n" +" {\n" +" if (isLeafNode)\n" +" {\n" +" int triangleIndex = getTriangleIndex(&rootNode);\n" +" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" +" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n" +" for (int b=0;b<numChildrenB;b++)\n" +" {\n" +" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n" +" {\n" +" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" +" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n" +" concavePairsOut[pairIdx+b] = newPair;\n" +" }\n" +" }\n" +" } else\n" +" {\n" +" int pairIdx = atomic_inc(numConcavePairsOut);\n" +" if (pairIdx<maxNumConcavePairsCapacity)\n" +" {\n" +" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n" +" concavePairsOut[pairIdx] = newPair;\n" +" }\n" +" }\n" +" } \n" +" curIndex++;\n" +" } else\n" +" {\n" +" if (isLeafNode)\n" +" {\n" +" curIndex++;\n" +" } else\n" +" {\n" +" escapeIndex = getEscapeIndex(&rootNode);\n" +" curIndex += escapeIndex;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl new file mode 100644 index 0000000000..e754f4e1da --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl @@ -0,0 +1,311 @@ + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#define AppendInc(x, out) out = atomic_inc(x) +#define GET_NPOINTS(x) (x).m_worldNormalOnB.w +#ifdef cl_ext_atomic_counters_32 + #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else + #define counter32_t volatile __global int* +#endif + + +__kernel void mprPenetrationKernel( __global int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global float4* separatingNormals, + __global int* hasSeparatingAxis, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int contactCapacity, + int numPairs) +{ + int i = get_global_id(0); + int pairIndex = i; + if (i<numPairs) + { + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + return; + } + + + if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) + { + return; + } + + float depthOut; + b3Float4 dirOut; + b3Float4 posOut; + + + int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut); + + + + + + if (res==0) + { + //add a contact + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + if (dstIdx<contactCapacity) + { + pairs[pairIndex].z = dstIdx; + __global struct b3Contact4Data* c = globalContactsOut + dstIdx; + c->m_worldNormalOnB = -dirOut;//normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + //for (int i=0;i<nContacts;i++) + posOut.w = -depthOut; + c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]]; + GET_NPOINTS(*c) = 1;//nContacts; + } + } + + } +} + +typedef float4 Quaternion; +#define make_float4 (float4) + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) +{ + return qtRotate( *orientation, *p ) + (*translation); +} + + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + +inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn, +const float4* dir, __global const float4* vertices, float* min, float* max) +{ + min[0] = FLT_MAX; + max[0] = -FLT_MAX; + int numVerts = hull->m_numVertices; + + const float4 localDir = qtInvRotate(orn,*dir); + float offset = dot(pos,*dir); + for(int i=0;i<numVerts;i++) + { + float dp = dot(vertices[hull->m_vertexOffset+i],localDir); + if(dp < min[0]) + min[0] = dp; + if(dp > max[0]) + max[0] = dp; + } + if(min[0]>max[0]) + { + float tmp = min[0]; + min[0] = max[0]; + max[0] = tmp; + } + min[0] += offset; + max[0] += offset; +} + + +bool findSeparatingAxisUnitSphere( __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* vertices, + __global const float4* unitSphereDirections, + int numUnitSphereDirections, + float4* sep, + float* dmin) +{ + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test unit sphere directions + for (int i=0;i<numUnitSphereDirections;i++) + { + + float4 crossje; + crossje = unitSphereDirections[i]; + + if (dot3F4(DeltaC2,crossje)>0) + crossje *= -1.f; + { + float dist; + bool result = true; + float Min0,Max0; + float Min1,Max1; + project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0); + project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + + +__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global const float4* unitSphereDirections, + __global float4* separatingNormals, + __global int* hasSeparatingAxis, + __global float* dmins, + int numUnitSphereDirections, + int numPairs + ) +{ + + int i = get_global_id(0); + + if (i<numPairs) + { + + if (hasSeparatingAxis[i]) + { + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + + float dmin = dmins[i]; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal = separatingNormals[i]; + + int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges; + if (numEdgeEdgeDirections>numUnitSphereDirections) + { + bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin); + if (!sepEE) + { + hasSeparatingAxis[i] = 0; + } else + { + hasSeparatingAxis[i] = 1; + separatingNormals[i] = sepNormal; + } + } + } //if (hasSeparatingAxis[i]) + }//(i<numPairs) +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h new file mode 100644 index 0000000000..7ed4b382c3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h @@ -0,0 +1,1446 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* mprKernelsCL= \ +"/***\n" +" * ---------------------------------\n" +" * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>\n" +" *\n" +" * This file was ported from mpr.c file, part of libccd.\n" +" * The Minkoski Portal Refinement implementation was ported \n" +" * to OpenCL by Erwin Coumans for the Bullet 3 Physics library.\n" +" * at http://github.com/erwincoumans/bullet3\n" +" *\n" +" * Distributed under the OSI-approved BSD License (the \"License\");\n" +" * see <http://www.opensource.org/licenses/bsd-license.php>.\n" +" * This software is distributed WITHOUT ANY WARRANTY; without even the\n" +" * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" +" * See the License for more information.\n" +" */\n" +"#ifndef B3_MPR_PENETRATION_H\n" +"#define B3_MPR_PENETRATION_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" +"#define B3_CONVEX_POLYHEDRON_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"typedef struct b3GpuFace b3GpuFace_t;\n" +"struct b3GpuFace\n" +"{\n" +" b3Float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +" int m_unusedPadding1;\n" +" int m_unusedPadding2;\n" +"};\n" +"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" +"struct b3ConvexPolyhedronData\n" +"{\n" +" b3Float4 m_localCenter;\n" +" b3Float4 m_extents;\n" +" b3Float4 mC;\n" +" b3Float4 mE;\n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"};\n" +"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" +"#ifndef B3_COLLIDABLE_H\n" +"#define B3_COLLIDABLE_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"enum b3ShapeTypes\n" +"{\n" +" SHAPE_HEIGHT_FIELD=1,\n" +" SHAPE_CONVEX_HULL=3,\n" +" SHAPE_PLANE=4,\n" +" SHAPE_CONCAVE_TRIMESH=5,\n" +" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" +" SHAPE_SPHERE=7,\n" +" MAX_NUM_SHAPE_TYPES,\n" +"};\n" +"typedef struct b3Collidable b3Collidable_t;\n" +"struct b3Collidable\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +"};\n" +"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" +"struct b3GpuChildShape\n" +"{\n" +" b3Float4 m_childPosition;\n" +" b3Quat m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"struct b3CompoundOverlappingPair\n" +"{\n" +" int m_bodyIndexA;\n" +" int m_bodyIndexB;\n" +"// int m_pairType;\n" +" int m_childShapeIndexA;\n" +" int m_childShapeIndexB;\n" +"};\n" +"#endif //B3_COLLIDABLE_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#define B3_MPR_SQRT sqrt\n" +"#endif\n" +"#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))\n" +"#define B3_MPR_FABS fabs\n" +"#define B3_MPR_TOLERANCE 1E-6f\n" +"#define B3_MPR_MAX_ITERATIONS 1000\n" +"struct _b3MprSupport_t \n" +"{\n" +" b3Float4 v; //!< Support point in minkowski sum\n" +" b3Float4 v1; //!< Support point in obj1\n" +" b3Float4 v2; //!< Support point in obj2\n" +"};\n" +"typedef struct _b3MprSupport_t b3MprSupport_t;\n" +"struct _b3MprSimplex_t \n" +"{\n" +" b3MprSupport_t ps[4];\n" +" int last; //!< index of last added point\n" +"};\n" +"typedef struct _b3MprSimplex_t b3MprSimplex_t;\n" +"inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)\n" +"{\n" +" return &s->ps[idx];\n" +"}\n" +"inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)\n" +"{\n" +" s->last = size - 1;\n" +"}\n" +"inline int b3MprSimplexSize(const b3MprSimplex_t *s)\n" +"{\n" +" return s->last + 1;\n" +"}\n" +"inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)\n" +"{\n" +" // here is no check on boundaries\n" +" return &s->ps[idx];\n" +"}\n" +"inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)\n" +"{\n" +" *d = *s;\n" +"}\n" +"inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)\n" +"{\n" +" b3MprSupportCopy(s->ps + pos, a);\n" +"}\n" +"inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)\n" +"{\n" +" b3MprSupport_t supp;\n" +" b3MprSupportCopy(&supp, &s->ps[pos1]);\n" +" b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);\n" +" b3MprSupportCopy(&s->ps[pos2], &supp);\n" +"}\n" +"inline int b3MprIsZero(float val)\n" +"{\n" +" return B3_MPR_FABS(val) < FLT_EPSILON;\n" +"}\n" +"inline int b3MprEq(float _a, float _b)\n" +"{\n" +" float ab;\n" +" float a, b;\n" +" ab = B3_MPR_FABS(_a - _b);\n" +" if (B3_MPR_FABS(ab) < FLT_EPSILON)\n" +" return 1;\n" +" a = B3_MPR_FABS(_a);\n" +" b = B3_MPR_FABS(_b);\n" +" if (b > a){\n" +" return ab < FLT_EPSILON * b;\n" +" }else{\n" +" return ab < FLT_EPSILON * a;\n" +" }\n" +"}\n" +"inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)\n" +"{\n" +" return b3MprEq((*a).x, (*b).x)\n" +" && b3MprEq((*a).y, (*b).y)\n" +" && b3MprEq((*a).z, (*b).z);\n" +"}\n" +"inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, b3ConstArray(b3Float4) verticesA)\n" +"{\n" +" b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" +" float maxDot = -B3_LARGE_FLOAT;\n" +" if( 0 < hull->m_numVertices )\n" +" {\n" +" const b3Float4 scaled = supportVec;\n" +" int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" +" return verticesA[hull->m_vertexOffset+index];\n" +" }\n" +" return supVec;\n" +"}\n" +"B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" const b3Float4* _dir, b3Float4* outp, int logme)\n" +"{\n" +" //dir is in worldspace, move to local space\n" +" \n" +" b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;\n" +" b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;\n" +" \n" +" b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);\n" +" \n" +" const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);\n" +" \n" +" //find local support vertex\n" +" int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;\n" +" \n" +" b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);\n" +" __global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];\n" +" \n" +" b3Float4 pInA;\n" +" if (logme)\n" +" {\n" +" b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" +" float maxDot = -B3_LARGE_FLOAT;\n" +" if( 0 < hull->m_numVertices )\n" +" {\n" +" const b3Float4 scaled = localDir;\n" +" int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" +" pInA = cpuVertices[hull->m_vertexOffset+index];\n" +" \n" +" }\n" +" } else\n" +" {\n" +" pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);\n" +" }\n" +" //move vertex to world space\n" +" *outp = b3TransformPoint(pInA,pos,orn);\n" +" \n" +"}\n" +"inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" const b3Float4* _dir, b3MprSupport_t *supp)\n" +"{\n" +" b3Float4 dir;\n" +" dir = *_dir;\n" +" b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);\n" +" dir = *_dir*-1.f;\n" +" b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);\n" +" supp->v = supp->v1 - supp->v2;\n" +"}\n" +"inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)\n" +"{\n" +" center->v1 = cpuBodyBuf[bodyIndexA].m_pos;\n" +" center->v2 = cpuBodyBuf[bodyIndexB].m_pos;\n" +" center->v = center->v1 - center->v2;\n" +"}\n" +"inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)\n" +"{\n" +" (*v).x = x;\n" +" (*v).y = y;\n" +" (*v).z = z;\n" +" (*v).w = 0.f;\n" +"}\n" +"inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)\n" +"{\n" +" (*v).x += (*w).x;\n" +" (*v).y += (*w).y;\n" +" (*v).z += (*w).z;\n" +"}\n" +"inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)\n" +"{\n" +" *v = *w;\n" +"}\n" +"inline void b3MprVec3Scale(b3Float4 *d, float k)\n" +"{\n" +" *d *= k;\n" +"}\n" +"inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)\n" +"{\n" +" float dot;\n" +" dot = b3Dot3F4(*a,*b);\n" +" return dot;\n" +"}\n" +"inline float b3MprVec3Len2(const b3Float4 *v)\n" +"{\n" +" return b3MprVec3Dot(v, v);\n" +"}\n" +"inline void b3MprVec3Normalize(b3Float4 *d)\n" +"{\n" +" float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));\n" +" b3MprVec3Scale(d, k);\n" +"}\n" +"inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)\n" +"{\n" +" *d = b3Cross3(*a,*b);\n" +" \n" +"}\n" +"inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)\n" +"{\n" +" *d = *v - *w;\n" +"}\n" +"inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)\n" +"{\n" +" b3Float4 v2v1, v3v1;\n" +" b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" b3MprVec3Cross(dir, &v2v1, &v3v1);\n" +" b3MprVec3Normalize(dir);\n" +"}\n" +"inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,\n" +" const b3Float4 *dir)\n" +"{\n" +" float dot;\n" +" dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);\n" +" return b3MprIsZero(dot) || dot > 0.f;\n" +"}\n" +"inline int portalReachTolerance(const b3MprSimplex_t *portal,\n" +" const b3MprSupport_t *v4,\n" +" const b3Float4 *dir)\n" +"{\n" +" float dv1, dv2, dv3, dv4;\n" +" float dot1, dot2, dot3;\n" +" // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}\n" +" dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);\n" +" dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);\n" +" dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);\n" +" dv4 = b3MprVec3Dot(&v4->v, dir);\n" +" dot1 = dv4 - dv1;\n" +" dot2 = dv4 - dv2;\n" +" dot3 = dv4 - dv3;\n" +" dot1 = B3_MPR_FMIN(dot1, dot2);\n" +" dot1 = B3_MPR_FMIN(dot1, dot3);\n" +" return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;\n" +"}\n" +"inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal, \n" +" const b3MprSupport_t *v4,\n" +" const b3Float4 *dir)\n" +"{\n" +" float dot;\n" +" dot = b3MprVec3Dot(&v4->v, dir);\n" +" return b3MprIsZero(dot) || dot > 0.f;\n" +"}\n" +"inline void b3ExpandPortal(b3MprSimplex_t *portal,\n" +" const b3MprSupport_t *v4)\n" +"{\n" +" float dot;\n" +" b3Float4 v4v0;\n" +" b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);\n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);\n" +" if (dot > 0.f){\n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);\n" +" if (dot > 0.f){\n" +" b3MprSimplexSet(portal, 1, v4);\n" +" }else{\n" +" b3MprSimplexSet(portal, 3, v4);\n" +" }\n" +" }else{\n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);\n" +" if (dot > 0.f){\n" +" b3MprSimplexSet(portal, 2, v4);\n" +" }else{\n" +" b3MprSimplexSet(portal, 1, v4);\n" +" }\n" +" }\n" +"}\n" +"B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" __global int* hasSepAxis,\n" +" b3MprSimplex_t *portal)\n" +"{\n" +" b3Float4 dir, va, vb;\n" +" float dot;\n" +" int cont;\n" +" \n" +" \n" +" // vertex 0 is center of portal\n" +" b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));\n" +" // vertex 0 is center of portal\n" +" b3MprSimplexSetSize(portal, 1);\n" +" \n" +" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" +" b3Float4* b3mpr_vec3_origin = &zero;\n" +" if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){\n" +" // Portal's center lies on origin (0,0,0) => we know that objects\n" +" // intersect but we would need to know penetration info.\n" +" // So move center little bit...\n" +" b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);\n" +" b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);\n" +" }\n" +" // vertex 1 = support in direction of origin\n" +" b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Scale(&dir, -1.f);\n" +" b3MprVec3Normalize(&dir);\n" +" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));\n" +" b3MprSimplexSetSize(portal, 2);\n" +" // test if origin isn't outside of v1\n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);\n" +" \n" +" if (b3MprIsZero(dot) || dot < 0.f)\n" +" return -1;\n" +" // vertex 2\n" +" b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" if (b3MprIsZero(b3MprVec3Len2(&dir))){\n" +" if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){\n" +" // origin lies on v1\n" +" return 1;\n" +" }else{\n" +" // origin lies on v0-v1 segment\n" +" return 2;\n" +" }\n" +" }\n" +" b3MprVec3Normalize(&dir);\n" +" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));\n" +" \n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);\n" +" if (b3MprIsZero(dot) || dot < 0.f)\n" +" return -1;\n" +" b3MprSimplexSetSize(portal, 3);\n" +" // vertex 3 direction\n" +" b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n" +" &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" +" &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Cross(&dir, &va, &vb);\n" +" b3MprVec3Normalize(&dir);\n" +" // it is better to form portal faces to be oriented \"outside\" origin\n" +" dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" +" if (dot > 0.f){\n" +" b3MprSimplexSwap(portal, 1, 2);\n" +" b3MprVec3Scale(&dir, -1.f);\n" +" }\n" +" while (b3MprSimplexSize(portal) < 4){\n" +" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));\n" +" \n" +" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);\n" +" if (b3MprIsZero(dot) || dot < 0.f)\n" +" return -1;\n" +" cont = 0;\n" +" // test if origin is outside (v1, v0, v3) - set v2 as v3 and\n" +" // continue\n" +" b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,\n" +" &b3MprSimplexPoint(portal, 3)->v);\n" +" dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" +" if (dot < 0.f && !b3MprIsZero(dot)){\n" +" b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));\n" +" cont = 1;\n" +" }\n" +" if (!cont){\n" +" // test if origin is outside (v3, v0, v2) - set v1 as v3 and\n" +" // continue\n" +" b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,\n" +" &b3MprSimplexPoint(portal, 2)->v);\n" +" dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" +" if (dot < 0.f && !b3MprIsZero(dot)){\n" +" b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));\n" +" cont = 1;\n" +" }\n" +" }\n" +" if (cont){\n" +" b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n" +" &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" +" &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Cross(&dir, &va, &vb);\n" +" b3MprVec3Normalize(&dir);\n" +" }else{\n" +" b3MprSimplexSetSize(portal, 4);\n" +" }\n" +" }\n" +" return 0;\n" +"}\n" +"B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" b3MprSimplex_t *portal)\n" +"{\n" +" b3Float4 dir;\n" +" b3MprSupport_t v4;\n" +" for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" +" //while (1)\n" +" {\n" +" // compute direction outside the portal (from v0 throught v1,v2,v3\n" +" // face)\n" +" b3PortalDir(portal, &dir);\n" +" // test if origin is inside the portal\n" +" if (portalEncapsulesOrigin(portal, &dir))\n" +" return 0;\n" +" // get next support point\n" +" \n" +" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" +" // test if v4 can expand portal to contain origin and if portal\n" +" // expanding doesn't reach given tolerance\n" +" if (!portalCanEncapsuleOrigin(portal, &v4, &dir)\n" +" || portalReachTolerance(portal, &v4, &dir))\n" +" {\n" +" return -1;\n" +" }\n" +" // v1-v2-v3 triangle must be rearranged to face outside Minkowski\n" +" // difference (direction from v0).\n" +" b3ExpandPortal(portal, &v4);\n" +" }\n" +" return -1;\n" +"}\n" +"B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)\n" +"{\n" +" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" +" b3Float4* b3mpr_vec3_origin = &zero;\n" +" b3Float4 dir;\n" +" size_t i;\n" +" float b[4], sum, inv;\n" +" b3Float4 vec, p1, p2;\n" +" b3PortalDir(portal, &dir);\n" +" // use barycentric coordinates of tetrahedron to find origin\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" +" &b3MprSimplexPoint(portal, 2)->v);\n" +" b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" +" &b3MprSimplexPoint(portal, 2)->v);\n" +" b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" +" sum = b[0] + b[1] + b[2] + b[3];\n" +" if (b3MprIsZero(sum) || sum < 0.f){\n" +" b[0] = 0.f;\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" +" &b3MprSimplexPoint(portal, 3)->v);\n" +" b[1] = b3MprVec3Dot(&vec, &dir);\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" +" &b3MprSimplexPoint(portal, 1)->v);\n" +" b[2] = b3MprVec3Dot(&vec, &dir);\n" +" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" +" &b3MprSimplexPoint(portal, 2)->v);\n" +" b[3] = b3MprVec3Dot(&vec, &dir);\n" +" sum = b[1] + b[2] + b[3];\n" +" }\n" +" inv = 1.f / sum;\n" +" b3MprVec3Copy(&p1, b3mpr_vec3_origin);\n" +" b3MprVec3Copy(&p2, b3mpr_vec3_origin);\n" +" for (i = 0; i < 4; i++){\n" +" b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);\n" +" b3MprVec3Scale(&vec, b[i]);\n" +" b3MprVec3Add(&p1, &vec);\n" +" b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);\n" +" b3MprVec3Scale(&vec, b[i]);\n" +" b3MprVec3Add(&p2, &vec);\n" +" }\n" +" b3MprVec3Scale(&p1, inv);\n" +" b3MprVec3Scale(&p2, inv);\n" +" b3MprVec3Copy(pos, &p1);\n" +" b3MprVec3Add(pos, &p2);\n" +" b3MprVec3Scale(pos, 0.5);\n" +"}\n" +"inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)\n" +"{\n" +" b3Float4 ab;\n" +" b3MprVec3Sub2(&ab, a, b);\n" +" return b3MprVec3Len2(&ab);\n" +"}\n" +"inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,\n" +" const b3Float4 *x0,\n" +" const b3Float4 *b,\n" +" b3Float4 *witness)\n" +"{\n" +" // The computation comes from solving equation of segment:\n" +" // S(t) = x0 + t.d\n" +" // where - x0 is initial point of segment\n" +" // - d is direction of segment from x0 (|d| > 0)\n" +" // - t belongs to <0, 1> interval\n" +" // \n" +" // Than, distance from a segment to some point P can be expressed:\n" +" // D(t) = |x0 + t.d - P|^2\n" +" // which is distance from any point on segment. Minimization\n" +" // of this function brings distance from P to segment.\n" +" // Minimization of D(t) leads to simple quadratic equation that's\n" +" // solving is straightforward.\n" +" //\n" +" // Bonus of this method is witness point for free.\n" +" float dist, t;\n" +" b3Float4 d, a;\n" +" // direction of segment\n" +" b3MprVec3Sub2(&d, b, x0);\n" +" // precompute vector from P to x0\n" +" b3MprVec3Sub2(&a, x0, P);\n" +" t = -1.f * b3MprVec3Dot(&a, &d);\n" +" t /= b3MprVec3Len2(&d);\n" +" if (t < 0.f || b3MprIsZero(t)){\n" +" dist = b3MprVec3Dist2(x0, P);\n" +" if (witness)\n" +" b3MprVec3Copy(witness, x0);\n" +" }else if (t > 1.f || b3MprEq(t, 1.f)){\n" +" dist = b3MprVec3Dist2(b, P);\n" +" if (witness)\n" +" b3MprVec3Copy(witness, b);\n" +" }else{\n" +" if (witness){\n" +" b3MprVec3Copy(witness, &d);\n" +" b3MprVec3Scale(witness, t);\n" +" b3MprVec3Add(witness, x0);\n" +" dist = b3MprVec3Dist2(witness, P);\n" +" }else{\n" +" // recycling variables\n" +" b3MprVec3Scale(&d, t);\n" +" b3MprVec3Add(&d, &a);\n" +" dist = b3MprVec3Len2(&d);\n" +" }\n" +" }\n" +" return dist;\n" +"}\n" +"inline float b3MprVec3PointTriDist2(const b3Float4 *P,\n" +" const b3Float4 *x0, const b3Float4 *B,\n" +" const b3Float4 *C,\n" +" b3Float4 *witness)\n" +"{\n" +" // Computation comes from analytic expression for triangle (x0, B, C)\n" +" // T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and\n" +" // Then equation for distance is:\n" +" // D(s, t) = | T(s, t) - P |^2\n" +" // This leads to minimization of quadratic function of two variables.\n" +" // The solution from is taken only if s is between 0 and 1, t is\n" +" // between 0 and 1 and t + s < 1, otherwise distance from segment is\n" +" // computed.\n" +" b3Float4 d1, d2, a;\n" +" float u, v, w, p, q, r;\n" +" float s, t, dist, dist2;\n" +" b3Float4 witness2;\n" +" b3MprVec3Sub2(&d1, B, x0);\n" +" b3MprVec3Sub2(&d2, C, x0);\n" +" b3MprVec3Sub2(&a, x0, P);\n" +" u = b3MprVec3Dot(&a, &a);\n" +" v = b3MprVec3Dot(&d1, &d1);\n" +" w = b3MprVec3Dot(&d2, &d2);\n" +" p = b3MprVec3Dot(&a, &d1);\n" +" q = b3MprVec3Dot(&a, &d2);\n" +" r = b3MprVec3Dot(&d1, &d2);\n" +" s = (q * r - w * p) / (w * v - r * r);\n" +" t = (-s * r - q) / w;\n" +" if ((b3MprIsZero(s) || s > 0.f)\n" +" && (b3MprEq(s, 1.f) || s < 1.f)\n" +" && (b3MprIsZero(t) || t > 0.f)\n" +" && (b3MprEq(t, 1.f) || t < 1.f)\n" +" && (b3MprEq(t + s, 1.f) || t + s < 1.f)){\n" +" if (witness){\n" +" b3MprVec3Scale(&d1, s);\n" +" b3MprVec3Scale(&d2, t);\n" +" b3MprVec3Copy(witness, x0);\n" +" b3MprVec3Add(witness, &d1);\n" +" b3MprVec3Add(witness, &d2);\n" +" dist = b3MprVec3Dist2(witness, P);\n" +" }else{\n" +" dist = s * s * v;\n" +" dist += t * t * w;\n" +" dist += 2.f * s * t * r;\n" +" dist += 2.f * s * p;\n" +" dist += 2.f * t * q;\n" +" dist += u;\n" +" }\n" +" }else{\n" +" dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);\n" +" dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);\n" +" if (dist2 < dist){\n" +" dist = dist2;\n" +" if (witness)\n" +" b3MprVec3Copy(witness, &witness2);\n" +" }\n" +" dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);\n" +" if (dist2 < dist){\n" +" dist = dist2;\n" +" if (witness)\n" +" b3MprVec3Copy(witness, &witness2);\n" +" }\n" +" }\n" +" return dist;\n" +"}\n" +"B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" b3MprSimplex_t *portal,\n" +" float *depth, b3Float4 *pdir, b3Float4 *pos)\n" +"{\n" +" b3Float4 dir;\n" +" b3MprSupport_t v4;\n" +" unsigned long iterations;\n" +" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" +" b3Float4* b3mpr_vec3_origin = &zero;\n" +" iterations = 1UL;\n" +" for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" +" //while (1)\n" +" {\n" +" // compute portal direction and obtain next support point\n" +" b3PortalDir(portal, &dir);\n" +" \n" +" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" +" // reached tolerance -> find penetration info\n" +" if (portalReachTolerance(portal, &v4, &dir)\n" +" || iterations ==B3_MPR_MAX_ITERATIONS)\n" +" {\n" +" *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);\n" +" *depth = B3_MPR_SQRT(*depth);\n" +" \n" +" if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))\n" +" {\n" +" \n" +" *pdir = dir;\n" +" } \n" +" b3MprVec3Normalize(pdir);\n" +" \n" +" // barycentric coordinates:\n" +" b3FindPos(portal, pos);\n" +" return;\n" +" }\n" +" b3ExpandPortal(portal, &v4);\n" +" iterations++;\n" +" }\n" +"}\n" +"B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)\n" +"{\n" +" // Touching contact on portal's v1 - so depth is zero and direction\n" +" // is unimportant and pos can be guessed\n" +" *depth = 0.f;\n" +" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" +" b3Float4* b3mpr_vec3_origin = &zero;\n" +" b3MprVec3Copy(dir, b3mpr_vec3_origin);\n" +" b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" +" b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" +" b3MprVec3Scale(pos, 0.5);\n" +"}\n" +"B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,\n" +" float *depth, b3Float4 *dir, b3Float4 *pos)\n" +"{\n" +" \n" +" // Origin lies on v0-v1 segment.\n" +" // Depth is distance to v1, direction also and position must be\n" +" // computed\n" +" b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" +" b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" +" b3MprVec3Scale(pos, 0.5f);\n" +" \n" +" b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);\n" +" *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));\n" +" b3MprVec3Normalize(dir);\n" +"}\n" +"inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,\n" +" b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,\n" +" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" +" b3ConstArray(b3Collidable_t) cpuCollidables,\n" +" b3ConstArray(b3Float4) cpuVertices,\n" +" __global b3Float4* sepAxis,\n" +" __global int* hasSepAxis,\n" +" float *depthOut, b3Float4* dirOut, b3Float4* posOut)\n" +"{\n" +" \n" +" b3MprSimplex_t portal;\n" +" \n" +"// if (!hasSepAxis[pairIndex])\n" +" // return -1;\n" +" \n" +" hasSepAxis[pairIndex] = 0;\n" +" int res;\n" +" // Phase 1: Portal discovery\n" +" res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);\n" +" \n" +" \n" +" //sepAxis[pairIndex] = *pdir;//or -dir?\n" +" switch (res)\n" +" {\n" +" case 0:\n" +" {\n" +" // Phase 2: Portal refinement\n" +" \n" +" res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);\n" +" if (res < 0)\n" +" return -1;\n" +" // Phase 3. Penetration info\n" +" b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);\n" +" hasSepAxis[pairIndex] = 1;\n" +" sepAxis[pairIndex] = -*dirOut;\n" +" break;\n" +" }\n" +" case 1:\n" +" {\n" +" // Touching contact on portal's v1.\n" +" b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);\n" +" break;\n" +" }\n" +" case 2:\n" +" {\n" +" \n" +" b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);\n" +" break;\n" +" }\n" +" default:\n" +" {\n" +" hasSepAxis[pairIndex]=0;\n" +" //if (res < 0)\n" +" //{\n" +" // Origin isn't inside portal - no collision.\n" +" return -1;\n" +" //}\n" +" }\n" +" };\n" +" \n" +" return 0;\n" +"};\n" +"#endif //B3_MPR_PENETRATION_H\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" +"#ifdef cl_ext_atomic_counters_32\n" +" #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +" #define counter32_t volatile __global int*\n" +"#endif\n" +"__kernel void mprPenetrationKernel( __global int4* pairs,\n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes, \n" +" __global const float4* vertices,\n" +" __global float4* separatingNormals,\n" +" __global int* hasSeparatingAxis,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int contactCapacity,\n" +" int numPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" if (i<numPairs)\n" +" {\n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" //once the broadphase avoids static-static pairs, we can remove this test\n" +" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" +" {\n" +" return;\n" +" }\n" +" \n" +" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" +" {\n" +" return;\n" +" }\n" +" float depthOut;\n" +" b3Float4 dirOut;\n" +" b3Float4 posOut;\n" +" int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);\n" +" \n" +" \n" +" \n" +" \n" +" if (res==0)\n" +" {\n" +" //add a contact\n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" if (dstIdx<contactCapacity)\n" +" {\n" +" pairs[pairIndex].z = dstIdx;\n" +" __global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n" +" c->m_worldNormalOnB = -dirOut;//normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" //for (int i=0;i<nContacts;i++)\n" +" posOut.w = -depthOut;\n" +" c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];\n" +" GET_NPOINTS(*c) = 1;//nContacts;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"typedef float4 Quaternion;\n" +"#define make_float4 (float4)\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" +"{\n" +" return qtRotate( *orientation, *p ) + (*translation);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn, \n" +"const float4* dir, __global const float4* vertices, float* min, float* max)\n" +"{\n" +" min[0] = FLT_MAX;\n" +" max[0] = -FLT_MAX;\n" +" int numVerts = hull->m_numVertices;\n" +" const float4 localDir = qtInvRotate(orn,*dir);\n" +" float offset = dot(pos,*dir);\n" +" for(int i=0;i<numVerts;i++)\n" +" {\n" +" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" +" if(dp < min[0]) \n" +" min[0] = dp;\n" +" if(dp > max[0]) \n" +" max[0] = dp;\n" +" }\n" +" if(min[0]>max[0])\n" +" {\n" +" float tmp = min[0];\n" +" min[0] = max[0];\n" +" max[0] = tmp;\n" +" }\n" +" min[0] += offset;\n" +" max[0] += offset;\n" +"}\n" +"bool findSeparatingAxisUnitSphere( __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* vertices,\n" +" __global const float4* unitSphereDirections,\n" +" int numUnitSphereDirections,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" int curEdgeEdge = 0;\n" +" // Test unit sphere directions\n" +" for (int i=0;i<numUnitSphereDirections;i++)\n" +" {\n" +" float4 crossje;\n" +" crossje = unitSphereDirections[i]; \n" +" if (dot3F4(DeltaC2,crossje)>0)\n" +" crossje *= -1.f;\n" +" {\n" +" float dist;\n" +" bool result = true;\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" +" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" +" \n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" \n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" dist = d0<d1 ? d0:d1;\n" +" result = true;\n" +" \n" +" if(dist<*dmin)\n" +" {\n" +" *dmin = dist;\n" +" *sep = crossje;\n" +" }\n" +" }\n" +" }\n" +" \n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs, \n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* unitSphereDirections,\n" +" __global float4* separatingNormals,\n" +" __global int* hasSeparatingAxis,\n" +" __global float* dmins,\n" +" int numUnitSphereDirections,\n" +" int numPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" \n" +" float dmin = dmins[i];\n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" float4 sepNormal = separatingNormals[i];\n" +" \n" +" int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" +" if (numEdgeEdgeDirections>numUnitSphereDirections)\n" +" {\n" +" bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);\n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis[i] = 1;\n" +" separatingNormals[i] = sepNormal;\n" +" }\n" +" }\n" +" } //if (hasSeparatingAxis[i])\n" +" }//(i<numPairs)\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl new file mode 100644 index 0000000000..9c9e920f13 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl @@ -0,0 +1,1374 @@ +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#define SHAPE_CONVEX_HULL 3 +#define SHAPE_PLANE 4 +#define SHAPE_CONCAVE_TRIMESH 5 +#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 +#define SHAPE_SPHERE 7 + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + +#define max2 max +#define min2 min + +typedef unsigned int u32; + + + + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + +///keep this in sync with btCollidable.h +typedef struct +{ + int m_numChildShapes; + float m_radius; + int m_shapeType; + int m_shapeIndex; + +} btCollidableGpu; + +typedef struct +{ + float4 m_childPosition; + float4 m_childOrientation; + int m_shapeIndex; + int m_unused0; + int m_unused1; + int m_unused2; +} btGpuChildShape; + +#define GET_NPOINTS(x) (x).m_worldNormalOnB.w + +typedef struct +{ + float4 m_pos; + float4 m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_collidableIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} BodyData; + + +typedef struct +{ + float4 m_localCenter; + float4 m_extents; + float4 mC; + float4 mE; + + float m_radius; + int m_faceOffset; + int m_numFaces; + int m_numVertices; + + int m_vertexOffset; + int m_uniqueEdgesOffset; + int m_numUniqueEdges; + int m_unused; + +} ConvexPolyhedronCL; + +typedef struct +{ + float4 m_plane; + int m_indexOffset; + int m_numIndices; +} btGpuFace; + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +//#define dot3F4 dot + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + +__inline +float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) +{ + return qtRotate( *orientation, *p ) + (*translation); +} + +void trInverse(float4 translationIn, Quaternion orientationIn, + float4* translationOut, Quaternion* orientationOut) +{ + *orientationOut = qtInvert(orientationIn); + *translationOut = qtRotate(*orientationOut, -translationIn); +} + +void trMul(float4 translationA, Quaternion orientationA, + float4 translationB, Quaternion orientationB, + float4* translationOut, Quaternion* orientationOut) +{ + *orientationOut = qtMul(orientationA,orientationB); + *translationOut = transform(&translationB,&translationA,&orientationA); +} + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +} + + +__inline float4 lerp3(const float4 a,const float4 b, float t) +{ + return make_float4( a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t, + 0.f); +} + + +float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace) +{ + float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0); + float dist = dot3F4(n, point) + planeEqn.w; + *closestPointOnFace = point - dist * n; + return dist; +} + + + +inline bool IsPointInPolygon(float4 p, + const btGpuFace* face, + __global const float4* baseVertex, + __global const int* convexIndices, + float4* out) +{ + float4 a; + float4 b; + float4 ab; + float4 ap; + float4 v; + + float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f); + + if (face->m_numIndices<2) + return false; + + + float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]]; + + b = v0; + + for(unsigned i=0; i != face->m_numIndices; ++i) + { + a = b; + float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]]; + b = vi; + ab = b-a; + ap = p-a; + v = cross3(ab,plane); + + if (dot(ap, v) > 0.f) + { + float ab_m2 = dot(ab, ab); + float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f; + if (rt <= 0.f) + { + *out = a; + } + else if (rt >= 1.f) + { + *out = b; + } + else + { + float s = 1.f - rt; + out[0].x = s * a.x + rt * b.x; + out[0].y = s * a.y + rt * b.y; + out[0].z = s * a.z + rt * b.z; + } + return false; + } + } + return true; +} + + + + +void computeContactSphereConvex(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* convexVertices, + __global const int* convexIndices, + __global const btGpuFace* faces, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int maxContactCapacity, + float4 spherePos2, + float radius, + float4 pos, + float4 quat + ) +{ + + float4 invPos; + float4 invOrn; + + trInverse(pos,quat, &invPos,&invOrn); + + float4 spherePos = transform(&spherePos2,&invPos,&invOrn); + + int shapeIndex = collidables[collidableIndexB].m_shapeIndex; + int numFaces = convexShapes[shapeIndex].m_numFaces; + float4 closestPnt = (float4)(0, 0, 0, 0); + float4 hitNormalWorld = (float4)(0, 0, 0, 0); + float minDist = -1000000.f; + bool bCollide = true; + + for ( int f = 0; f < numFaces; f++ ) + { + btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f]; + + // set up a plane equation + float4 planeEqn; + float4 n1 = face.m_plane; + n1.w = 0.f; + planeEqn = n1; + planeEqn.w = face.m_plane.w; + + + // compute a signed distance from the vertex in cloth to the face of rigidbody. + float4 pntReturn; + float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn); + + // If the distance is positive, the plane is a separating plane. + if ( dist > radius ) + { + bCollide = false; + break; + } + + + if (dist>0) + { + //might hit an edge or vertex + float4 out; + float4 zeroPos = make_float4(0,0,0,0); + + bool isInPoly = IsPointInPolygon(spherePos, + &face, + &convexVertices[convexShapes[shapeIndex].m_vertexOffset], + convexIndices, + &out); + if (isInPoly) + { + if (dist>minDist) + { + minDist = dist; + closestPnt = pntReturn; + hitNormalWorld = planeEqn; + + } + } else + { + float4 tmp = spherePos-out; + float l2 = dot(tmp,tmp); + if (l2<radius*radius) + { + dist = sqrt(l2); + if (dist>minDist) + { + minDist = dist; + closestPnt = out; + hitNormalWorld = tmp/dist; + + } + + } else + { + bCollide = false; + break; + } + } + } else + { + if ( dist > minDist ) + { + minDist = dist; + closestPnt = pntReturn; + hitNormalWorld.xyz = planeEqn.xyz; + } + } + + } + + + + if (bCollide && minDist > -10000) + { + float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld); + float4 pOnB1 = transform(&closestPnt,&pos,&quat); + + float actualDepth = minDist-radius; + if (actualDepth<=0.f) + { + + + pOnB1.w = actualDepth; + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + + + if (1)//dstIdx < maxContactCapacity) + { + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -normalOnSurfaceB1; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + c->m_worldPosB[0] = pOnB1; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + + GET_NPOINTS(*c) = 1; + } + + } + }//if (hasCollision) + +} + + + +int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx) +{ + if( nPoints == 0 ) + return 0; + + if (nPoints <=4) + return nPoints; + + + if (nPoints >64) + nPoints = 64; + + float4 center = make_float4(0.f); + { + + for (int i=0;i<nPoints;i++) + center += p[i]; + center /= (float)nPoints; + } + + + + // sample 4 directions + + float4 aVector = p[0] - center; + float4 u = cross3( nearNormal, aVector ); + float4 v = cross3( nearNormal, u ); + u = normalize3( u ); + v = normalize3( v ); + + + //keep point with deepest penetration + float minW= FLT_MAX; + + int minIndex=-1; + + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // idx, distance + for(int ie = 0; ie<nPoints; ie++ ) + { + if (p[ie].w<minW) + { + minW = p[ie].w; + minIndex=ie; + } + float f; + float4 r = p[ie]-center; + f = dot3F4( u, r ); + if (f<maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4( -u, r ); + if (f<maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + + f = dot3F4( v, r ); + if (f<maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4( -v, r ); + if (f<maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; + +} + +#define MAX_PLANE_CONVEX_POINTS 64 + +int computeContactPlaneConvex(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + __global const BodyData* rigidBodies, + __global const btCollidableGpu*collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* convexVertices, + __global const int* convexIndices, + __global const btGpuFace* faces, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int maxContactCapacity, + float4 posB, + Quaternion ornB + ) +{ + int resultIndex=-1; + + int shapeIndex = collidables[collidableIndexB].m_shapeIndex; + __global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex]; + + float4 posA; + posA = rigidBodies[bodyIndexA].m_pos; + Quaternion ornA; + ornA = rigidBodies[bodyIndexA].m_quat; + + int numContactsOut = 0; + int numWorldVertsB1= 0; + + float4 planeEq; + planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; + float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f); + float4 planeNormalWorld; + planeNormalWorld = qtRotate(ornA,planeNormal); + float planeConstant = planeEq.w; + + float4 invPosA;Quaternion invOrnA; + float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1; + { + + trInverse(posA,ornA,&invPosA,&invOrnA); + trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1); + } + float4 invPosB;Quaternion invOrnB; + float4 planeInConvexPos1; Quaternion planeInConvexOrn1; + { + + trInverse(posB,ornB,&invPosB,&invOrnB); + trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1); + } + + + float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal); + float maxDot = -1e30; + int hitVertex=-1; + float4 hitVtx; + + + + float4 contactPoints[MAX_PLANE_CONVEX_POINTS]; + int numPoints = 0; + + int4 contactIdx; + contactIdx=make_int4(0,1,2,3); + + + for (int i=0;i<hullB->m_numVertices;i++) + { + float4 vtx = convexVertices[hullB->m_vertexOffset+i]; + float curDot = dot(vtx,planeNormalInConvex); + + + if (curDot>maxDot) + { + hitVertex=i; + maxDot=curDot; + hitVtx = vtx; + //make sure the deepest points is always included + if (numPoints==MAX_PLANE_CONVEX_POINTS) + numPoints--; + } + + if (numPoints<MAX_PLANE_CONVEX_POINTS) + { + float4 vtxWorld = transform(&vtx, &posB, &ornB); + float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld; + float dist = dot(planeNormal,vtxInPlane)-planeConstant; + if (dist<0.f) + { + vtxWorld.w = dist; + contactPoints[numPoints] = vtxWorld; + numPoints++; + } + } + + } + + int numReducedPoints = numPoints; + if (numPoints>4) + { + numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx); + } + + if (numReducedPoints>0) + { + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + + if (dstIdx < maxContactCapacity) + { + resultIndex = dstIdx; + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -planeNormalWorld; + //c->setFrictionCoeff(0.7); + //c->setRestituitionCoeff(0.f); + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + + switch (numReducedPoints) + { + case 4: + c->m_worldPosB[3] = contactPoints[contactIdx.w]; + case 3: + c->m_worldPosB[2] = contactPoints[contactIdx.z]; + case 2: + c->m_worldPosB[1] = contactPoints[contactIdx.y]; + case 1: + c->m_worldPosB[0] = contactPoints[contactIdx.x]; + default: + { + } + }; + + GET_NPOINTS(*c) = numReducedPoints; + }//if (dstIdx < numPairs) + } + + return resultIndex; +} + + +void computeContactPlaneSphere(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const btGpuFace* faces, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int maxContactCapacity) +{ + float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; + float radius = collidables[collidableIndexB].m_radius; + float4 posA1 = rigidBodies[bodyIndexA].m_pos; + float4 ornA1 = rigidBodies[bodyIndexA].m_quat; + float4 posB1 = rigidBodies[bodyIndexB].m_pos; + float4 ornB1 = rigidBodies[bodyIndexB].m_quat; + + bool hasCollision = false; + float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f); + float planeConstant = planeEq.w; + float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1; + { + float4 invPosA;Quaternion invOrnA; + trInverse(posA1,ornA1,&invPosA,&invOrnA); + trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1); + } + float4 planeInConvexPos1; Quaternion planeInConvexOrn1; + { + float4 invPosB;Quaternion invOrnB; + trInverse(posB1,ornB1,&invPosB,&invOrnB); + trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1); + } + float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius; + float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1); + float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant; + hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold(); + if (hasCollision) + { + float4 vtxInPlaneProjected1 = vtxInPlane1 - distance*planeNormal1; + float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1); + float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1); + float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance; + pOnB1.w = distance; + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + + if (dstIdx < maxContactCapacity) + { + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -normalOnSurfaceB1; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + c->m_worldPosB[0] = pOnB1; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + GET_NPOINTS(*c) = 1; + }//if (dstIdx < numPairs) + }//if (hasCollision) +} + + +__kernel void primitiveContactsKernel( __global int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int numPairs, int maxContactCapacity) +{ + + int i = get_global_id(0); + int pairIndex = i; + + float4 worldVertsB1[64]; + float4 worldVertsB2[64]; + int capacityWorldVerts = 64; + + float4 localContactsOut[64]; + int localContactCapacity=64; + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numPairs) + { + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE && + collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) + { + + float4 posB; + posB = rigidBodies[bodyIndexB].m_pos; + Quaternion ornB; + ornB = rigidBodies[bodyIndexB].m_quat; + int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, + rigidBodies,collidables,convexShapes,vertices,indices, + faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB); + if (contactIndex>=0) + pairs[pairIndex].z = contactIndex; + + return; + } + + + if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && + collidables[collidableIndexB].m_shapeType == SHAPE_PLANE) + { + + float4 posA; + posA = rigidBodies[bodyIndexA].m_pos; + Quaternion ornA; + ornA = rigidBodies[bodyIndexA].m_quat; + + + int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, + rigidBodies,collidables,convexShapes,vertices,indices, + faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA); + + if (contactIndex>=0) + pairs[pairIndex].z = contactIndex; + + return; + } + + if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE && + collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) + { + computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, + rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity); + return; + } + + + if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && + collidables[collidableIndexB].m_shapeType == SHAPE_PLANE) + { + + + computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, + rigidBodies,collidables, + faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity); + + return; + } + + + + + if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && + collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) + { + + float4 spherePos = rigidBodies[bodyIndexA].m_pos; + float sphereRadius = collidables[collidableIndexA].m_radius; + float4 convexPos = rigidBodies[bodyIndexB].m_pos; + float4 convexOrn = rigidBodies[bodyIndexB].m_quat; + + computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, + rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, + spherePos,sphereRadius,convexPos,convexOrn); + + return; + } + + if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && + collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) + { + + float4 spherePos = rigidBodies[bodyIndexB].m_pos; + float sphereRadius = collidables[collidableIndexB].m_radius; + float4 convexPos = rigidBodies[bodyIndexA].m_pos; + float4 convexOrn = rigidBodies[bodyIndexA].m_quat; + + computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, + rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, + spherePos,sphereRadius,convexPos,convexOrn); + return; + } + + + + + + + if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && + collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) + { + //sphere-sphere + float radiusA = collidables[collidableIndexA].m_radius; + float radiusB = collidables[collidableIndexB].m_radius; + float4 posA = rigidBodies[bodyIndexA].m_pos; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + float4 diff = posA-posB; + float len = length(diff); + + ///iff distance positive, don't generate a new contact + if ( len <= (radiusA+radiusB)) + { + ///distance (negative means penetration) + float dist = len - (radiusA+radiusB); + float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f); + if (len > 0.00001) + { + normalOnSurfaceB = diff / len; + } + float4 contactPosB = posB + normalOnSurfaceB*radiusB; + contactPosB.w = dist; + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + + if (dstIdx < maxContactCapacity) + { + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = normalOnSurfaceB; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_worldPosB[0] = contactPosB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + GET_NPOINTS(*c) = 1; + }//if (dstIdx < numPairs) + }//if ( len <= (radiusA+radiusB)) + + return; + }//SHAPE_SPHERE SHAPE_SPHERE + + }// if (i<numPairs) + +} + + +// work-in-progress +__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global const btGpuChildShape* gpuChildShapes, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int numCompoundPairs, int maxContactCapacity + ) +{ + + int i = get_global_id(0); + if (i<numCompoundPairs) + { + int bodyIndexA = gpuCompoundPairs[i].x; + int bodyIndexB = gpuCompoundPairs[i].y; + + int childShapeIndexA = gpuCompoundPairs[i].z; + int childShapeIndexB = gpuCompoundPairs[i].w; + + int collidableIndexA = -1; + int collidableIndexB = -1; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 posA = rigidBodies[bodyIndexA].m_pos; + + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + if (childShapeIndexA >= 0) + { + collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = qtRotate(ornA,childPosA)+posA; + float4 newOrnA = qtMul(ornA,childOrnA); + posA = newPosA; + ornA = newOrnA; + } else + { + collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + } + + if (childShapeIndexB>=0) + { + collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + } else + { + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + } + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + int shapeTypeA = collidables[collidableIndexA].m_shapeType; + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + + int pairIndex = i; + if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL)) + { + + computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB, collidableIndexA,collidableIndexB, + rigidBodies,collidables,convexShapes,vertices,indices, + faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB); + return; + } + + if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE)) + { + + computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, + rigidBodies,collidables,convexShapes,vertices,indices, + faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA); + return; + } + + if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE)) + { + float4 spherePos = rigidBodies[bodyIndexB].m_pos; + float sphereRadius = collidables[collidableIndexB].m_radius; + float4 convexPos = posA; + float4 convexOrn = ornA; + + computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, + rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, + spherePos,sphereRadius,convexPos,convexOrn); + + return; + } + + if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL)) + { + + float4 spherePos = rigidBodies[bodyIndexA].m_pos; + float sphereRadius = collidables[collidableIndexA].m_radius; + float4 convexPos = posB; + float4 convexOrn = ornB; + + + computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, + rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, + spherePos,sphereRadius,convexPos,convexOrn); + + return; + } + }// if (i<numCompoundPairs) +} + + +bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p ) +{ + + const float4* p1 = &vertices[0]; + const float4* p2 = &vertices[1]; + const float4* p3 = &vertices[2]; + + float4 edge1; edge1 = (*p2 - *p1); + float4 edge2; edge2 = ( *p3 - *p2 ); + float4 edge3; edge3 = ( *p1 - *p3 ); + + + float4 p1_to_p; p1_to_p = ( *p - *p1 ); + float4 p2_to_p; p2_to_p = ( *p - *p2 ); + float4 p3_to_p; p3_to_p = ( *p - *p3 ); + + float4 edge1_normal; edge1_normal = ( cross(edge1,*normal)); + float4 edge2_normal; edge2_normal = ( cross(edge2,*normal)); + float4 edge3_normal; edge3_normal = ( cross(edge3,*normal)); + + + + float r1, r2, r3; + r1 = dot(edge1_normal,p1_to_p ); + r2 = dot(edge2_normal,p2_to_p ); + r3 = dot(edge3_normal,p3_to_p ); + + if ( r1 > 0 && r2 > 0 && r3 > 0 ) + return true; + if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) + return true; + return false; + +} + + +float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) +{ + float4 diff = p - from; + float4 v = to - from; + float t = dot(v,diff); + + if (t > 0) + { + float dotVV = dot(v,v); + if (t < dotVV) + { + t /= dotVV; + diff -= t*v; + } else + { + t = 1; + diff -= v; + } + } else + { + t = 0; + } + *nearest = from + t*v; + return dot(diff,diff); +} + + +void computeContactSphereTriangle(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + const float4* triangleVertices, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int maxContactCapacity, + float4 spherePos2, + float radius, + float4 pos, + float4 quat, + int faceIndex + ) +{ + + float4 invPos; + float4 invOrn; + + trInverse(pos,quat, &invPos,&invOrn); + float4 spherePos = transform(&spherePos2,&invPos,&invOrn); + int numFaces = 3; + float4 closestPnt = (float4)(0, 0, 0, 0); + float4 hitNormalWorld = (float4)(0, 0, 0, 0); + float minDist = -1000000.f; + bool bCollide = false; + + + ////////////////////////////////////// + + float4 sphereCenter; + sphereCenter = spherePos; + + const float4* vertices = triangleVertices; + float contactBreakingThreshold = 0.f;//todo? + float radiusWithThreshold = radius + contactBreakingThreshold; + float4 edge10; + edge10 = vertices[1]-vertices[0]; + edge10.w = 0.f;//is this needed? + float4 edge20; + edge20 = vertices[2]-vertices[0]; + edge20.w = 0.f;//is this needed? + float4 normal = cross3(edge10,edge20); + normal = normalize(normal); + float4 p1ToCenter; + p1ToCenter = sphereCenter - vertices[0]; + + float distanceFromPlane = dot(p1ToCenter,normal); + + if (distanceFromPlane < 0.f) + { + //triangle facing the other way + distanceFromPlane *= -1.f; + normal *= -1.f; + } + hitNormalWorld = normal; + + bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold; + + // Check for contact / intersection + bool hasContact = false; + float4 contactPoint; + if (isInsideContactPlane) + { + + if (pointInTriangle(vertices,&normal, &sphereCenter)) + { + // Inside the contact wedge - touches a point on the shell plane + hasContact = true; + contactPoint = sphereCenter - normal*distanceFromPlane; + + } else { + // Could be inside one of the contact capsules + float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold; + float4 nearestOnEdge; + int numEdges = 3; + for (int i = 0; i < numEdges; i++) + { + float4 pa =vertices[i]; + float4 pb = vertices[(i+1)%3]; + + float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge); + if (distanceSqr < contactCapsuleRadiusSqr) + { + // Yep, we're inside a capsule + hasContact = true; + contactPoint = nearestOnEdge; + + } + + } + } + } + + if (hasContact) + { + + closestPnt = contactPoint; + float4 contactToCenter = sphereCenter - contactPoint; + minDist = length(contactToCenter); + if (minDist>FLT_EPSILON) + { + hitNormalWorld = normalize(contactToCenter);//*(1./minDist); + bCollide = true; + } + + } + + + ///////////////////////////////////// + + if (bCollide && minDist > -10000) + { + + float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld); + float4 pOnB1 = transform(&closestPnt,&pos,&quat); + float actualDepth = minDist-radius; + + + if (actualDepth<=0.f) + { + pOnB1.w = actualDepth; + int dstIdx; + + + float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1); + if (lenSqr>FLT_EPSILON) + { + AppendInc( nGlobalContactsOut, dstIdx ); + + if (dstIdx < maxContactCapacity) + { + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -normalOnSurfaceB1; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; + c->m_worldPosB[0] = pOnB1; + + c->m_childIndexA = -1; + c->m_childIndexB = faceIndex; + + GET_NPOINTS(*c) = 1; + } + } + + } + }//if (hasCollision) + +} + + + +// work-in-progress +__kernel void findConcaveSphereContactsKernel( __global int4* concavePairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int numConcavePairs, int maxContactCapacity + ) +{ + + int i = get_global_id(0); + if (i>=numConcavePairs) + return; + int pairIdx = i; + + int bodyIndexA = concavePairs[i].x; + int bodyIndexB = concavePairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE) + { + int f = concavePairs[i].z; + btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f]; + + float4 verticesA[3]; + for (int i=0;i<3;i++) + { + int index = indices[face.m_indexOffset+i]; + float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index]; + verticesA[i] = vert; + } + + float4 spherePos = rigidBodies[bodyIndexB].m_pos; + float sphereRadius = collidables[collidableIndexB].m_radius; + float4 convexPos = rigidBodies[bodyIndexA].m_pos; + float4 convexOrn = rigidBodies[bodyIndexA].m_quat; + + computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, + rigidBodies,collidables, + verticesA, + globalContactsOut, nGlobalContactsOut,maxContactCapacity, + spherePos,sphereRadius,convexPos,convexOrn, f); + + return; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h new file mode 100644 index 0000000000..b0103fe674 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h @@ -0,0 +1,1289 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* primitiveContactsKernelsCL= \ +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#define SHAPE_CONVEX_HULL 3\n" +"#define SHAPE_PLANE 4\n" +"#define SHAPE_CONCAVE_TRIMESH 5\n" +"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" +"#define SHAPE_SPHERE 7\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define max2 max\n" +"#define min2 min\n" +"typedef unsigned int u32;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"///keep this in sync with btCollidable.h\n" +"typedef struct\n" +"{\n" +" int m_numChildShapes;\n" +" float m_radius;\n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +" \n" +"} btCollidableGpu;\n" +"typedef struct\n" +"{\n" +" float4 m_childPosition;\n" +" float4 m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"} btGpuChildShape;\n" +"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" float4 m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_collidableIdx; \n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} BodyData;\n" +"typedef struct \n" +"{\n" +" float4 m_localCenter;\n" +" float4 m_extents;\n" +" float4 mC;\n" +" float4 mE;\n" +" \n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" \n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"} ConvexPolyhedronCL;\n" +"typedef struct\n" +"{\n" +" float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +"} btGpuFace;\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"//#define dot3F4 dot\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"__inline\n" +"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" +"{\n" +" return qtRotate( *orientation, *p ) + (*translation);\n" +"}\n" +"void trInverse(float4 translationIn, Quaternion orientationIn,\n" +" float4* translationOut, Quaternion* orientationOut)\n" +"{\n" +" *orientationOut = qtInvert(orientationIn);\n" +" *translationOut = qtRotate(*orientationOut, -translationIn);\n" +"}\n" +"void trMul(float4 translationA, Quaternion orientationA,\n" +" float4 translationB, Quaternion orientationB,\n" +" float4* translationOut, Quaternion* orientationOut)\n" +"{\n" +" *orientationOut = qtMul(orientationA,orientationB);\n" +" *translationOut = transform(&translationB,&translationA,&orientationA);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"}\n" +"__inline float4 lerp3(const float4 a,const float4 b, float t)\n" +"{\n" +" return make_float4( a.x + (b.x - a.x) * t,\n" +" a.y + (b.y - a.y) * t,\n" +" a.z + (b.z - a.z) * t,\n" +" 0.f);\n" +"}\n" +"float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n" +"{\n" +" float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);\n" +" float dist = dot3F4(n, point) + planeEqn.w;\n" +" *closestPointOnFace = point - dist * n;\n" +" return dist;\n" +"}\n" +"inline bool IsPointInPolygon(float4 p, \n" +" const btGpuFace* face,\n" +" __global const float4* baseVertex,\n" +" __global const int* convexIndices,\n" +" float4* out)\n" +"{\n" +" float4 a;\n" +" float4 b;\n" +" float4 ab;\n" +" float4 ap;\n" +" float4 v;\n" +" float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n" +" \n" +" if (face->m_numIndices<2)\n" +" return false;\n" +" \n" +" float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];\n" +" \n" +" b = v0;\n" +" for(unsigned i=0; i != face->m_numIndices; ++i)\n" +" {\n" +" a = b;\n" +" float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];\n" +" b = vi;\n" +" ab = b-a;\n" +" ap = p-a;\n" +" v = cross3(ab,plane);\n" +" if (dot(ap, v) > 0.f)\n" +" {\n" +" float ab_m2 = dot(ab, ab);\n" +" float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;\n" +" if (rt <= 0.f)\n" +" {\n" +" *out = a;\n" +" }\n" +" else if (rt >= 1.f) \n" +" {\n" +" *out = b;\n" +" }\n" +" else\n" +" {\n" +" float s = 1.f - rt;\n" +" out[0].x = s * a.x + rt * b.x;\n" +" out[0].y = s * a.y + rt * b.y;\n" +" out[0].z = s * a.z + rt * b.z;\n" +" }\n" +" return false;\n" +" }\n" +" }\n" +" return true;\n" +"}\n" +"void computeContactSphereConvex(int pairIndex,\n" +" int bodyIndexA, int bodyIndexB, \n" +" int collidableIndexA, int collidableIndexB, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" __global const float4* convexVertices,\n" +" __global const int* convexIndices,\n" +" __global const btGpuFace* faces,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int maxContactCapacity,\n" +" float4 spherePos2,\n" +" float radius,\n" +" float4 pos,\n" +" float4 quat\n" +" )\n" +"{\n" +" float4 invPos;\n" +" float4 invOrn;\n" +" trInverse(pos,quat, &invPos,&invOrn);\n" +" float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" +" int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" +" int numFaces = convexShapes[shapeIndex].m_numFaces;\n" +" float4 closestPnt = (float4)(0, 0, 0, 0);\n" +" float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" +" float minDist = -1000000.f;\n" +" bool bCollide = true;\n" +" for ( int f = 0; f < numFaces; f++ )\n" +" {\n" +" btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n" +" // set up a plane equation \n" +" float4 planeEqn;\n" +" float4 n1 = face.m_plane;\n" +" n1.w = 0.f;\n" +" planeEqn = n1;\n" +" planeEqn.w = face.m_plane.w;\n" +" \n" +" \n" +" // compute a signed distance from the vertex in cloth to the face of rigidbody.\n" +" float4 pntReturn;\n" +" float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n" +" // If the distance is positive, the plane is a separating plane. \n" +" if ( dist > radius )\n" +" {\n" +" bCollide = false;\n" +" break;\n" +" }\n" +" if (dist>0)\n" +" {\n" +" //might hit an edge or vertex\n" +" float4 out;\n" +" float4 zeroPos = make_float4(0,0,0,0);\n" +" bool isInPoly = IsPointInPolygon(spherePos,\n" +" &face,\n" +" &convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n" +" convexIndices,\n" +" &out);\n" +" if (isInPoly)\n" +" {\n" +" if (dist>minDist)\n" +" {\n" +" minDist = dist;\n" +" closestPnt = pntReturn;\n" +" hitNormalWorld = planeEqn;\n" +" \n" +" }\n" +" } else\n" +" {\n" +" float4 tmp = spherePos-out;\n" +" float l2 = dot(tmp,tmp);\n" +" if (l2<radius*radius)\n" +" {\n" +" dist = sqrt(l2);\n" +" if (dist>minDist)\n" +" {\n" +" minDist = dist;\n" +" closestPnt = out;\n" +" hitNormalWorld = tmp/dist;\n" +" \n" +" }\n" +" \n" +" } else\n" +" {\n" +" bCollide = false;\n" +" break;\n" +" }\n" +" }\n" +" } else\n" +" {\n" +" if ( dist > minDist )\n" +" {\n" +" minDist = dist;\n" +" closestPnt = pntReturn;\n" +" hitNormalWorld.xyz = planeEqn.xyz;\n" +" }\n" +" }\n" +" \n" +" }\n" +" \n" +" if (bCollide && minDist > -10000)\n" +" {\n" +" float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n" +" float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" +" \n" +" float actualDepth = minDist-radius;\n" +" if (actualDepth<=0.f)\n" +" {\n" +" \n" +" pOnB1.w = actualDepth;\n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" \n" +" \n" +" if (1)//dstIdx < maxContactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" +" c->m_worldPosB[0] = pOnB1;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" GET_NPOINTS(*c) = 1;\n" +" } \n" +" }\n" +" }//if (hasCollision)\n" +"}\n" +" \n" +"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" +"{\n" +" if( nPoints == 0 )\n" +" return 0;\n" +" \n" +" if (nPoints <=4)\n" +" return nPoints;\n" +" \n" +" \n" +" if (nPoints >64)\n" +" nPoints = 64;\n" +" \n" +" float4 center = make_float4(0.f);\n" +" {\n" +" \n" +" for (int i=0;i<nPoints;i++)\n" +" center += p[i];\n" +" center /= (float)nPoints;\n" +" }\n" +" \n" +" \n" +" \n" +" // sample 4 directions\n" +" \n" +" float4 aVector = p[0] - center;\n" +" float4 u = cross3( nearNormal, aVector );\n" +" float4 v = cross3( nearNormal, u );\n" +" u = normalize3( u );\n" +" v = normalize3( v );\n" +" \n" +" \n" +" //keep point with deepest penetration\n" +" float minW= FLT_MAX;\n" +" \n" +" int minIndex=-1;\n" +" \n" +" float4 maxDots;\n" +" maxDots.x = FLT_MIN;\n" +" maxDots.y = FLT_MIN;\n" +" maxDots.z = FLT_MIN;\n" +" maxDots.w = FLT_MIN;\n" +" \n" +" // idx, distance\n" +" for(int ie = 0; ie<nPoints; ie++ )\n" +" {\n" +" if (p[ie].w<minW)\n" +" {\n" +" minW = p[ie].w;\n" +" minIndex=ie;\n" +" }\n" +" float f;\n" +" float4 r = p[ie]-center;\n" +" f = dot3F4( u, r );\n" +" if (f<maxDots.x)\n" +" {\n" +" maxDots.x = f;\n" +" contactIdx[0].x = ie;\n" +" }\n" +" \n" +" f = dot3F4( -u, r );\n" +" if (f<maxDots.y)\n" +" {\n" +" maxDots.y = f;\n" +" contactIdx[0].y = ie;\n" +" }\n" +" \n" +" \n" +" f = dot3F4( v, r );\n" +" if (f<maxDots.z)\n" +" {\n" +" maxDots.z = f;\n" +" contactIdx[0].z = ie;\n" +" }\n" +" \n" +" f = dot3F4( -v, r );\n" +" if (f<maxDots.w)\n" +" {\n" +" maxDots.w = f;\n" +" contactIdx[0].w = ie;\n" +" }\n" +" \n" +" }\n" +" \n" +" if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" +" {\n" +" //replace the first contact with minimum (todo: replace contact with least penetration)\n" +" contactIdx[0].x = minIndex;\n" +" }\n" +" \n" +" return 4;\n" +" \n" +"}\n" +"#define MAX_PLANE_CONVEX_POINTS 64\n" +"int computeContactPlaneConvex(int pairIndex,\n" +" int bodyIndexA, int bodyIndexB, \n" +" int collidableIndexA, int collidableIndexB, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu*collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" __global const float4* convexVertices,\n" +" __global const int* convexIndices,\n" +" __global const btGpuFace* faces,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int maxContactCapacity,\n" +" float4 posB,\n" +" Quaternion ornB\n" +" )\n" +"{\n" +" int resultIndex=-1;\n" +" int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" +" __global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n" +" \n" +" float4 posA;\n" +" posA = rigidBodies[bodyIndexA].m_pos;\n" +" Quaternion ornA;\n" +" ornA = rigidBodies[bodyIndexA].m_quat;\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" float4 planeEq;\n" +" planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" +" float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" +" float4 planeNormalWorld;\n" +" planeNormalWorld = qtRotate(ornA,planeNormal);\n" +" float planeConstant = planeEq.w;\n" +" \n" +" float4 invPosA;Quaternion invOrnA;\n" +" float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" +" {\n" +" \n" +" trInverse(posA,ornA,&invPosA,&invOrnA);\n" +" trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" +" }\n" +" float4 invPosB;Quaternion invOrnB;\n" +" float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" +" {\n" +" \n" +" trInverse(posB,ornB,&invPosB,&invOrnB);\n" +" trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1); \n" +" }\n" +" \n" +" float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n" +" float maxDot = -1e30;\n" +" int hitVertex=-1;\n" +" float4 hitVtx;\n" +" float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n" +" int numPoints = 0;\n" +" int4 contactIdx;\n" +" contactIdx=make_int4(0,1,2,3);\n" +" \n" +" \n" +" for (int i=0;i<hullB->m_numVertices;i++)\n" +" {\n" +" float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n" +" float curDot = dot(vtx,planeNormalInConvex);\n" +" if (curDot>maxDot)\n" +" {\n" +" hitVertex=i;\n" +" maxDot=curDot;\n" +" hitVtx = vtx;\n" +" //make sure the deepest points is always included\n" +" if (numPoints==MAX_PLANE_CONVEX_POINTS)\n" +" numPoints--;\n" +" }\n" +" if (numPoints<MAX_PLANE_CONVEX_POINTS)\n" +" {\n" +" float4 vtxWorld = transform(&vtx, &posB, &ornB);\n" +" float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;\n" +" float dist = dot(planeNormal,vtxInPlane)-planeConstant;\n" +" if (dist<0.f)\n" +" {\n" +" vtxWorld.w = dist;\n" +" contactPoints[numPoints] = vtxWorld;\n" +" numPoints++;\n" +" }\n" +" }\n" +" }\n" +" int numReducedPoints = numPoints;\n" +" if (numPoints>4)\n" +" {\n" +" numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n" +" }\n" +" if (numReducedPoints>0)\n" +" {\n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" if (dstIdx < maxContactCapacity)\n" +" {\n" +" resultIndex = dstIdx;\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -planeNormalWorld;\n" +" //c->setFrictionCoeff(0.7);\n" +" //c->setRestituitionCoeff(0.f);\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" switch (numReducedPoints)\n" +" {\n" +" case 4:\n" +" c->m_worldPosB[3] = contactPoints[contactIdx.w];\n" +" case 3:\n" +" c->m_worldPosB[2] = contactPoints[contactIdx.z];\n" +" case 2:\n" +" c->m_worldPosB[1] = contactPoints[contactIdx.y];\n" +" case 1:\n" +" c->m_worldPosB[0] = contactPoints[contactIdx.x];\n" +" default:\n" +" {\n" +" }\n" +" };\n" +" \n" +" GET_NPOINTS(*c) = numReducedPoints;\n" +" }//if (dstIdx < numPairs)\n" +" } \n" +" return resultIndex;\n" +"}\n" +"void computeContactPlaneSphere(int pairIndex,\n" +" int bodyIndexA, int bodyIndexB, \n" +" int collidableIndexA, int collidableIndexB, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const btGpuFace* faces,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int maxContactCapacity)\n" +"{\n" +" float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" +" float radius = collidables[collidableIndexB].m_radius;\n" +" float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n" +" float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n" +" float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n" +" float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n" +" \n" +" bool hasCollision = false;\n" +" float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" +" float planeConstant = planeEq.w;\n" +" float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" +" {\n" +" float4 invPosA;Quaternion invOrnA;\n" +" trInverse(posA1,ornA1,&invPosA,&invOrnA);\n" +" trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" +" }\n" +" float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" +" {\n" +" float4 invPosB;Quaternion invOrnB;\n" +" trInverse(posB1,ornB1,&invPosB,&invOrnB);\n" +" trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1); \n" +" }\n" +" float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n" +" float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" +" float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n" +" hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n" +" if (hasCollision)\n" +" {\n" +" float4 vtxInPlaneProjected1 = vtxInPlane1 - distance*planeNormal1;\n" +" float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n" +" float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n" +" float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n" +" pOnB1.w = distance;\n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" \n" +" if (dstIdx < maxContactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" +" c->m_worldPosB[0] = pOnB1;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" GET_NPOINTS(*c) = 1;\n" +" }//if (dstIdx < numPairs)\n" +" }//if (hasCollision)\n" +"}\n" +"__kernel void primitiveContactsKernel( __global int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int numPairs, int maxContactCapacity)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" float4 worldVertsB1[64];\n" +" float4 worldVertsB2[64];\n" +" int capacityWorldVerts = 64; \n" +" float4 localContactsOut[64];\n" +" int localContactCapacity=64;\n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" if (i<numPairs)\n" +" {\n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" +" {\n" +" float4 posB;\n" +" posB = rigidBodies[bodyIndexB].m_pos;\n" +" Quaternion ornB;\n" +" ornB = rigidBodies[bodyIndexB].m_quat;\n" +" int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,\n" +" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);\n" +" if (contactIndex>=0)\n" +" pairs[pairIndex].z = contactIndex;\n" +" return;\n" +" }\n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" +" {\n" +" float4 posA;\n" +" posA = rigidBodies[bodyIndexA].m_pos;\n" +" Quaternion ornA;\n" +" ornA = rigidBodies[bodyIndexA].m_quat;\n" +" int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,\n" +" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" +" if (contactIndex>=0)\n" +" pairs[pairIndex].z = contactIndex;\n" +" return;\n" +" }\n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" +" rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" +" return;\n" +" }\n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" +" {\n" +" computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" +" rigidBodies,collidables,\n" +" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" +" return;\n" +" }\n" +" \n" +" \n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" +" {\n" +" \n" +" float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" +" float sphereRadius = collidables[collidableIndexA].m_radius;\n" +" float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n" +" float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n" +" computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" +" spherePos,sphereRadius,convexPos,convexOrn);\n" +" return;\n" +" }\n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" \n" +" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" +" float sphereRadius = collidables[collidableIndexB].m_radius;\n" +" float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" +" float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" +" computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" +" spherePos,sphereRadius,convexPos,convexOrn);\n" +" return;\n" +" }\n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" //sphere-sphere\n" +" float radiusA = collidables[collidableIndexA].m_radius;\n" +" float radiusB = collidables[collidableIndexB].m_radius;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" float4 diff = posA-posB;\n" +" float len = length(diff);\n" +" \n" +" ///iff distance positive, don't generate a new contact\n" +" if ( len <= (radiusA+radiusB))\n" +" {\n" +" ///distance (negative means penetration)\n" +" float dist = len - (radiusA+radiusB);\n" +" float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" +" if (len > 0.00001)\n" +" {\n" +" normalOnSurfaceB = diff / len;\n" +" }\n" +" float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" +" contactPosB.w = dist;\n" +" \n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" \n" +" if (dstIdx < maxContactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = normalOnSurfaceB;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_worldPosB[0] = contactPosB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" GET_NPOINTS(*c) = 1;\n" +" }//if (dstIdx < numPairs)\n" +" }//if ( len <= (radiusA+radiusB))\n" +" return;\n" +" }//SHAPE_SPHERE SHAPE_SPHERE\n" +" }// if (i<numPairs)\n" +"}\n" +"// work-in-progress\n" +"__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int numCompoundPairs, int maxContactCapacity\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i<numCompoundPairs)\n" +" {\n" +" int bodyIndexA = gpuCompoundPairs[i].x;\n" +" int bodyIndexB = gpuCompoundPairs[i].y;\n" +" int childShapeIndexA = gpuCompoundPairs[i].z;\n" +" int childShapeIndexB = gpuCompoundPairs[i].w;\n" +" \n" +" int collidableIndexA = -1;\n" +" int collidableIndexB = -1;\n" +" \n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" \n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" \n" +" if (childShapeIndexA >= 0)\n" +" {\n" +" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" +" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" +" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" +" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" +" float4 newOrnA = qtMul(ornA,childOrnA);\n" +" posA = newPosA;\n" +" ornA = newOrnA;\n" +" } else\n" +" {\n" +" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" }\n" +" \n" +" if (childShapeIndexB>=0)\n" +" {\n" +" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" } else\n" +" {\n" +" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" +" }\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" +" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" +" int pairIndex = i;\n" +" if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n" +" {\n" +" computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB, collidableIndexA,collidableIndexB, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,\n" +" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n" +" return;\n" +" }\n" +" if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n" +" {\n" +" computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,\n" +" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" +" return;\n" +" }\n" +" if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n" +" {\n" +" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" +" float sphereRadius = collidables[collidableIndexB].m_radius;\n" +" float4 convexPos = posA;\n" +" float4 convexOrn = ornA;\n" +" \n" +" computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" +" spherePos,sphereRadius,convexPos,convexOrn);\n" +" \n" +" return;\n" +" }\n" +" if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n" +" {\n" +" float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" +" float sphereRadius = collidables[collidableIndexA].m_radius;\n" +" float4 convexPos = posB;\n" +" float4 convexOrn = ornB;\n" +" \n" +" computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" +" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" +" spherePos,sphereRadius,convexPos,convexOrn);\n" +" \n" +" return;\n" +" }\n" +" }// if (i<numCompoundPairs)\n" +"}\n" +"bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n" +"{\n" +" const float4* p1 = &vertices[0];\n" +" const float4* p2 = &vertices[1];\n" +" const float4* p3 = &vertices[2];\n" +" float4 edge1; edge1 = (*p2 - *p1);\n" +" float4 edge2; edge2 = ( *p3 - *p2 );\n" +" float4 edge3; edge3 = ( *p1 - *p3 );\n" +" \n" +" float4 p1_to_p; p1_to_p = ( *p - *p1 );\n" +" float4 p2_to_p; p2_to_p = ( *p - *p2 );\n" +" float4 p3_to_p; p3_to_p = ( *p - *p3 );\n" +" float4 edge1_normal; edge1_normal = ( cross(edge1,*normal));\n" +" float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));\n" +" float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));\n" +" \n" +" \n" +" float r1, r2, r3;\n" +" r1 = dot(edge1_normal,p1_to_p );\n" +" r2 = dot(edge2_normal,p2_to_p );\n" +" r3 = dot(edge3_normal,p3_to_p );\n" +" \n" +" if ( r1 > 0 && r2 > 0 && r3 > 0 )\n" +" return true;\n" +" if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) \n" +" return true;\n" +" return false;\n" +"}\n" +"float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n" +"{\n" +" float4 diff = p - from;\n" +" float4 v = to - from;\n" +" float t = dot(v,diff);\n" +" \n" +" if (t > 0) \n" +" {\n" +" float dotVV = dot(v,v);\n" +" if (t < dotVV) \n" +" {\n" +" t /= dotVV;\n" +" diff -= t*v;\n" +" } else \n" +" {\n" +" t = 1;\n" +" diff -= v;\n" +" }\n" +" } else\n" +" {\n" +" t = 0;\n" +" }\n" +" *nearest = from + t*v;\n" +" return dot(diff,diff); \n" +"}\n" +"void computeContactSphereTriangle(int pairIndex,\n" +" int bodyIndexA, int bodyIndexB,\n" +" int collidableIndexA, int collidableIndexB, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" const float4* triangleVertices,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int maxContactCapacity,\n" +" float4 spherePos2,\n" +" float radius,\n" +" float4 pos,\n" +" float4 quat,\n" +" int faceIndex\n" +" )\n" +"{\n" +" float4 invPos;\n" +" float4 invOrn;\n" +" trInverse(pos,quat, &invPos,&invOrn);\n" +" float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" +" int numFaces = 3;\n" +" float4 closestPnt = (float4)(0, 0, 0, 0);\n" +" float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" +" float minDist = -1000000.f;\n" +" bool bCollide = false;\n" +" \n" +" //////////////////////////////////////\n" +" float4 sphereCenter;\n" +" sphereCenter = spherePos;\n" +" const float4* vertices = triangleVertices;\n" +" float contactBreakingThreshold = 0.f;//todo?\n" +" float radiusWithThreshold = radius + contactBreakingThreshold;\n" +" float4 edge10;\n" +" edge10 = vertices[1]-vertices[0];\n" +" edge10.w = 0.f;//is this needed?\n" +" float4 edge20;\n" +" edge20 = vertices[2]-vertices[0];\n" +" edge20.w = 0.f;//is this needed?\n" +" float4 normal = cross3(edge10,edge20);\n" +" normal = normalize(normal);\n" +" float4 p1ToCenter;\n" +" p1ToCenter = sphereCenter - vertices[0];\n" +" \n" +" float distanceFromPlane = dot(p1ToCenter,normal);\n" +" if (distanceFromPlane < 0.f)\n" +" {\n" +" //triangle facing the other way\n" +" distanceFromPlane *= -1.f;\n" +" normal *= -1.f;\n" +" }\n" +" hitNormalWorld = normal;\n" +" bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n" +" \n" +" // Check for contact / intersection\n" +" bool hasContact = false;\n" +" float4 contactPoint;\n" +" if (isInsideContactPlane) \n" +" {\n" +" \n" +" if (pointInTriangle(vertices,&normal, &sphereCenter)) \n" +" {\n" +" // Inside the contact wedge - touches a point on the shell plane\n" +" hasContact = true;\n" +" contactPoint = sphereCenter - normal*distanceFromPlane;\n" +" \n" +" } else {\n" +" // Could be inside one of the contact capsules\n" +" float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;\n" +" float4 nearestOnEdge;\n" +" int numEdges = 3;\n" +" for (int i = 0; i < numEdges; i++) \n" +" {\n" +" float4 pa =vertices[i];\n" +" float4 pb = vertices[(i+1)%3];\n" +" float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n" +" if (distanceSqr < contactCapsuleRadiusSqr) \n" +" {\n" +" // Yep, we're inside a capsule\n" +" hasContact = true;\n" +" contactPoint = nearestOnEdge;\n" +" \n" +" }\n" +" \n" +" }\n" +" }\n" +" }\n" +" if (hasContact) \n" +" {\n" +" closestPnt = contactPoint;\n" +" float4 contactToCenter = sphereCenter - contactPoint;\n" +" minDist = length(contactToCenter);\n" +" if (minDist>FLT_EPSILON)\n" +" {\n" +" hitNormalWorld = normalize(contactToCenter);//*(1./minDist);\n" +" bCollide = true;\n" +" }\n" +" \n" +" }\n" +" /////////////////////////////////////\n" +" if (bCollide && minDist > -10000)\n" +" {\n" +" \n" +" float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n" +" float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" +" float actualDepth = minDist-radius;\n" +" \n" +" if (actualDepth<=0.f)\n" +" {\n" +" pOnB1.w = actualDepth;\n" +" int dstIdx;\n" +" \n" +" float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n" +" if (lenSqr>FLT_EPSILON)\n" +" {\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" \n" +" if (dstIdx < maxContactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" +" c->m_worldPosB[0] = pOnB1;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = faceIndex;\n" +" GET_NPOINTS(*c) = 1;\n" +" } \n" +" }\n" +" }\n" +" }//if (hasCollision)\n" +"}\n" +"// work-in-progress\n" +"__kernel void findConcaveSphereContactsKernel( __global int4* concavePairs,\n" +" __global const BodyData* rigidBodies,\n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int numConcavePairs, int maxContactCapacity\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConcavePairs)\n" +" return;\n" +" int pairIdx = i;\n" +" int bodyIndexA = concavePairs[i].x;\n" +" int bodyIndexB = concavePairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n" +" {\n" +" int f = concavePairs[i].z;\n" +" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" +" \n" +" float4 verticesA[3];\n" +" for (int i=0;i<3;i++)\n" +" {\n" +" int index = indices[face.m_indexOffset+i];\n" +" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" +" verticesA[i] = vert;\n" +" }\n" +" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" +" float sphereRadius = collidables[collidableIndexB].m_radius;\n" +" float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" +" float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" +" computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" +" rigidBodies,collidables,\n" +" verticesA,\n" +" globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" +" spherePos,sphereRadius,convexPos,convexOrn, f);\n" +" return;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl new file mode 100644 index 0000000000..a6565fd6fa --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl @@ -0,0 +1,2018 @@ +//keep this enum in sync with the CPU version (in btCollidable.h) +//written by Erwin Coumans + + +#define SHAPE_CONVEX_HULL 3 +#define SHAPE_CONCAVE_TRIMESH 5 +#define TRIANGLE_NUM_CONVEX_FACES 5 +#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 + +#define B3_MAX_STACK_DEPTH 256 + + +typedef unsigned int u32; + +///keep this in sync with btCollidable.h +typedef struct +{ + union { + int m_numChildShapes; + int m_bvhIndex; + }; + union + { + float m_radius; + int m_compoundBvhIndex; + }; + + int m_shapeType; + int m_shapeIndex; + +} btCollidableGpu; + +#define MAX_NUM_PARTS_IN_BITS 10 + +///b3QuantizedBvhNode is a compressed aabb node, 16 bytes. +///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range). +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes + int m_escapeIndexOrTriangleIndex; +} b3QuantizedBvhNode; + +typedef struct +{ + float4 m_aabbMin; + float4 m_aabbMax; + float4 m_quantization; + int m_numNodes; + int m_numSubTrees; + int m_nodeOffset; + int m_subTreeOffset; + +} b3BvhInfo; + + +int getTriangleIndex(const b3QuantizedBvhNode* rootNode) +{ + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); +} + +int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); +} + +int isLeafNode(const b3QuantizedBvhNode* rootNode) +{ + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0; +} + +int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0; +} + +int getEscapeIndex(const b3QuantizedBvhNode* rootNode) +{ + return -rootNode->m_escapeIndexOrTriangleIndex; +} + +int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + return -rootNode->m_escapeIndexOrTriangleIndex; +} + + +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes, points to the root of the subtree + int m_rootNodeIndex; + //4 bytes + int m_subtreeSize; + int m_padding[3]; +} b3BvhSubtreeInfo; + + + + + + + +typedef struct +{ + float4 m_childPosition; + float4 m_childOrientation; + int m_shapeIndex; + int m_unused0; + int m_unused1; + int m_unused2; +} btGpuChildShape; + + +typedef struct +{ + float4 m_pos; + float4 m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_collidableIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} BodyData; + + +typedef struct +{ + float4 m_localCenter; + float4 m_extents; + float4 mC; + float4 mE; + + float m_radius; + int m_faceOffset; + int m_numFaces; + int m_numVertices; + + int m_vertexOffset; + int m_uniqueEdgesOffset; + int m_numUniqueEdges; + int m_unused; +} ConvexPolyhedronCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + +#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3Common/shared/b3Int2.h" + + + +typedef struct +{ + float4 m_plane; + int m_indexOffset; + int m_numIndices; +} btGpuFace; + +#define make_float4 (float4) + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); + + +// float4 a1 = make_float4(a.xyz,0.f); +// float4 b1 = make_float4(b.xyz,0.f); + +// return cross(a1,b1); + +//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f); + + // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f); + + //return c; +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float4 fastNormalize4(float4 v) +{ + v = make_float4(v.xyz,0.f); + return fast_normalize(v); +} + + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + +__inline +float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) +{ + return qtRotate( *orientation, *p ) + (*translation); +} + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +} + +inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, +const float4* dir, const float4* vertices, float* min, float* max) +{ + min[0] = FLT_MAX; + max[0] = -FLT_MAX; + int numVerts = hull->m_numVertices; + + const float4 localDir = qtInvRotate(orn,*dir); + float offset = dot(pos,*dir); + for(int i=0;i<numVerts;i++) + { + float dp = dot(vertices[hull->m_vertexOffset+i],localDir); + if(dp < min[0]) + min[0] = dp; + if(dp > max[0]) + max[0] = dp; + } + if(min[0]>max[0]) + { + float tmp = min[0]; + min[0] = max[0]; + max[0] = tmp; + } + min[0] += offset; + max[0] += offset; +} + +inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, +const float4* dir, __global const float4* vertices, float* min, float* max) +{ + min[0] = FLT_MAX; + max[0] = -FLT_MAX; + int numVerts = hull->m_numVertices; + + const float4 localDir = qtInvRotate(orn,*dir); + float offset = dot(pos,*dir); + for(int i=0;i<numVerts;i++) + { + float dp = dot(vertices[hull->m_vertexOffset+i],localDir); + if(dp < min[0]) + min[0] = dp; + if(dp > max[0]) + max[0] = dp; + } + if(min[0]>max[0]) + { + float tmp = min[0]; + min[0] = max[0]; + max[0] = tmp; + } + min[0] += offset; + max[0] += offset; +} + +inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA,const float4 ornA, + const float4 posB,const float4 ornB, + float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth) +{ + float Min0,Max0; + float Min1,Max1; + projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0); + project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + *depth = d0<d1 ? d0:d1; + return true; +} + + + + +inline bool IsAlmostZero(const float4 v) +{ + if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f) + return false; + return true; +} + + + +bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + + const float4* verticesA, + const float4* uniqueEdgesA, + const btGpuFace* facesA, + const int* indicesA, + + __global const float4* verticesB, + __global const float4* uniqueEdgesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + int curPlaneTests=0; + { + int numFacesA = hullA->m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4 normal = facesA[hullA->m_faceOffset+i].m_plane; + float4 faceANormalWS = qtRotate(ornA,normal); + if (dot3F4(DeltaC2,faceANormalWS)<0) + faceANormalWS*=-1.f; + curPlaneTests++; + float d; + if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d)) + return false; + if(d<*dmin) + { + *dmin = d; + *sep = faceANormalWS; + } + } + } + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + +bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* verticesA, + __global const float4* uniqueEdgesA, + __global const btGpuFace* facesA, + __global const int* indicesA, + const float4* verticesB, + const float4* uniqueEdgesB, + const btGpuFace* facesB, + const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + int curPlaneTests=0; + { + int numFacesA = hullA->m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4 normal = facesA[hullA->m_faceOffset+i].m_plane; + float4 faceANormalWS = qtRotate(ornA,normal); + if (dot3F4(DeltaC2,faceANormalWS)<0) + faceANormalWS *= -1.f; + curPlaneTests++; + float d; + if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d)) + return false; + if(d<*dmin) + { + *dmin = d; + *sep = faceANormalWS; + } + } + } + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + + +bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + const float4* verticesA, + const float4* uniqueEdgesA, + const btGpuFace* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const float4* uniqueEdgesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test edges + for(int e0=0;e0<hullA->m_numUniqueEdges;e0++) + { + const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0]; + float4 edge0World = qtRotate(ornA,edge0); + + for(int e1=0;e1<hullB->m_numUniqueEdges;e1++) + { + const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1]; + float4 edge1World = qtRotate(ornB,edge1); + + + float4 crossje = cross3(edge0World,edge1World); + + curEdgeEdge++; + if(!IsAlmostZero(crossje)) + { + crossje = normalize3(crossje); + if (dot3F4(DeltaC2,crossje)<0) + crossje *= -1.f; + + float dist; + bool result = true; + { + float Min0,Max0; + float Min1,Max1; + projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0); + project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + result = false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + } + + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + +inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA,const float4 ornA, + const float4 posB,const float4 ornB, + float4* sep_axis, __global const float4* vertices,float* depth) +{ + float Min0,Max0; + float Min1,Max1; + project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0); + project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + *depth = d0<d1 ? d0:d1; + return true; +} + + +bool findSeparatingAxis( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + { + int numFacesA = hullA->m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4 normal = faces[hullA->m_faceOffset+i].m_plane; + float4 faceANormalWS = qtRotate(ornA,normal); + + if (dot3F4(DeltaC2,faceANormalWS)<0) + faceANormalWS*=-1.f; + + curPlaneTests++; + + float d; + if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d)) + return false; + + if(d<*dmin) + { + *dmin = d; + *sep = faceANormalWS; + } + } + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + + return true; +} + + + + +bool findSeparatingAxisUnitSphere( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* vertices, + __global const float4* unitSphereDirections, + int numUnitSphereDirections, + float4* sep, + float* dmin) +{ + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test unit sphere directions + for (int i=0;i<numUnitSphereDirections;i++) + { + + float4 crossje; + crossje = unitSphereDirections[i]; + + if (dot3F4(DeltaC2,crossje)>0) + crossje *= -1.f; + { + float dist; + bool result = true; + float Min0,Max0; + float Min1,Max1; + project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0); + project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + +bool findSeparatingAxisEdgeEdge( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test edges + for(int e0=0;e0<hullA->m_numUniqueEdges;e0++) + { + const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0]; + float4 edge0World = qtRotate(ornA,edge0); + + for(int e1=0;e1<hullB->m_numUniqueEdges;e1++) + { + const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1]; + float4 edge1World = qtRotate(ornB,edge1); + + + float4 crossje = cross3(edge0World,edge1World); + + curEdgeEdge++; + if(!IsAlmostZero(crossje)) + { + crossje = normalize3(crossje); + if (dot3F4(DeltaC2,crossje)<0) + crossje*=-1.f; + + float dist; + bool result = true; + { + float Min0,Max0; + float Min1,Max1; + project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0); + project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + } + + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + +// work-in-progress +__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global const btGpuChildShape* gpuChildShapes, + __global volatile float4* gpuCompoundSepNormalsOut, + __global volatile int* gpuHasCompoundSepNormalsOut, + int numCompoundPairs + ) +{ + + int i = get_global_id(0); + if (i<numCompoundPairs) + { + int bodyIndexA = gpuCompoundPairs[i].x; + int bodyIndexB = gpuCompoundPairs[i].y; + + int childShapeIndexA = gpuCompoundPairs[i].z; + int childShapeIndexB = gpuCompoundPairs[i].w; + + int collidableIndexA = -1; + int collidableIndexB = -1; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 posA = rigidBodies[bodyIndexA].m_pos; + + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + if (childShapeIndexA >= 0) + { + collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = qtRotate(ornA,childPosA)+posA; + float4 newOrnA = qtMul(ornA,childOrnA); + posA = newPosA; + ornA = newOrnA; + } else + { + collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + } + + if (childShapeIndexB>=0) + { + collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + } else + { + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + } + + gpuHasCompoundSepNormalsOut[i] = 0; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + int shapeTypeA = collidables[collidableIndexA].m_shapeType; + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + + + if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL)) + { + return; + } + + int hasSeparatingAxis = 5; + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + posA.w = 0.f; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal = make_float4(1,0,0,0); + bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + hasSeparatingAxis = 4; + if (!sepA) + { + hasSeparatingAxis = 0; + } else + { + bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + + if (!sepB) + { + hasSeparatingAxis = 0; + } else//(!sepB) + { + bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + if (sepEE) + { + gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal); + gpuHasCompoundSepNormalsOut[i] = 1; + }//sepEE + }//(!sepB) + }//(!sepA) + + + } + +} + + +inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin) +{ + b3Float4 vecOut; + vecOut = b3MakeFloat4( + (float)(vecIn[0]) / (quantization.x), + (float)(vecIn[1]) / (quantization.y), + (float)(vecIn[2]) / (quantization.z), + 0.f); + + vecOut += bvhAabbMin; + return vecOut; +} + +inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin) +{ + b3Float4 vecOut; + vecOut = b3MakeFloat4( + (float)(vecIn[0]) / (quantization.x), + (float)(vecIn[1]) / (quantization.y), + (float)(vecIn[2]) / (quantization.z), + 0.f); + + vecOut += bvhAabbMin; + return vecOut; +} + + +// work-in-progress +__kernel void findCompoundPairsKernel( __global const int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global b3Aabb_t* aabbLocalSpace, + __global const btGpuChildShape* gpuChildShapes, + __global volatile int4* gpuCompoundPairsOut, + __global volatile int* numCompoundPairsOut, + __global const b3BvhSubtreeInfo* subtrees, + __global const b3QuantizedBvhNode* quantizedNodes, + __global const b3BvhInfo* bvhInfos, + int numPairs, + int maxNumCompoundPairsCapacity + ) +{ + + int i = get_global_id(0); + + if (i<numPairs) + { + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + return; + } + + if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + int bvhA = collidables[collidableIndexA].m_compoundBvhIndex; + int bvhB = collidables[collidableIndexB].m_compoundBvhIndex; + int numSubTreesA = bvhInfos[bvhA].m_numSubTrees; + int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset; + int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset; + + + int numSubTreesB = bvhInfos[bvhB].m_numSubTrees; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; + + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + + for (int p=0;p<numSubTreesA;p++) + { + b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p]; + //bvhInfos[bvhA].m_quantization + b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin); + b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin); + + b3Float4 aabbAMinOut,aabbAMaxOut; + float margin=0.f; + b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut); + + for (int q=0;q<numSubTreesB;q++) + { + b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q]; + + b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin); + b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin); + + b3Float4 aabbBMinOut,aabbBMaxOut; + float margin=0.f; + b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut); + + + + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + if (aabbOverlap) + { + + int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset; + int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize; + + int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset; + int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize; + + + b3Int2 nodeStack[B3_MAX_STACK_DEPTH]; + b3Int2 node0; + node0.x = startNodeIndexA; + node0.y = startNodeIndexB; + int maxStackDepth = B3_MAX_STACK_DEPTH; + int depth=0; + nodeStack[depth++]=node0; + + do + { + b3Int2 node = nodeStack[--depth]; + + b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin); + b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin); + + b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin); + b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin); + + float margin=0.f; + b3Float4 aabbAMinOut,aabbAMaxOut; + b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut); + + b3Float4 aabbBMinOut,aabbBMaxOut; + b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut); + + + bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + if (nodeOverlap) + { + bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]); + bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]); + bool isInternalA = !isLeafA; + bool isInternalB = !isLeafB; + + //fail, even though it might hit two leaf nodes + if (depth+4>maxStackDepth && !(isLeafA && isLeafB)) + { + //printf("Error: traversal exceeded maxStackDepth"); + continue; + } + + if(isInternalA) + { + int nodeAleftChild = node.x+1; + bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]); + int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]); + + if(isInternalB) + { + int nodeBleftChild = node.y+1; + bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]); + int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]); + + nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild); + nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild); + } + else + { + nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y); + nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y); + } + } + else + { + if(isInternalB) + { + int nodeBleftChild = node.y+1; + bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]); + int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]); + nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild); + } + else + { + int compoundPairIdx = atomic_inc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]); + int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]); + gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + } + } + } + } + } while (depth); + } + } + } + + return; + } + + + + + + if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + + if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + + int numChildrenA = collidables[collidableIndexA].m_numChildShapes; + for (int c=0;c<numChildrenA;c++) + { + int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c; + int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = qtRotate(ornA,childPosA)+posA; + float4 newOrnA = qtMul(ornA,childOrnA); + + int shapeIndexA = collidables[childColIndexA].m_shapeIndex; + b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA]; + float margin = 0.f; + + b3Float4 aabbAMinWS; + b3Float4 aabbAMaxWS; + + b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin, + newPosA, + newOrnA, + &aabbAMinWS,&aabbAMaxWS); + + + if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int b=0;b<numChildrenB;b++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + + int shapeIndexB = collidables[childColIndexB].m_shapeIndex; + b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB]; + + b3Float4 aabbBMinWS; + b3Float4 aabbBMaxWS; + + b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin, + newPosB, + newOrnB, + &aabbBMinWS,&aabbBMaxWS); + + + + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS); + if (aabbOverlap) + { + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + float4 posA = newPosA; + posA.w = 0.f; + float4 posB = newPosB; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = newOrnA; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =newOrnB; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + + {// + int compoundPairIdx = atomic_inc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + } + }// + }//fi(1) + } //for (int b=0 + }//if (collidables[collidableIndexB]. + else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + if (1) + { + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + float4 posA = newPosA; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = newOrnA; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + + { + int compoundPairIdx = atomic_inc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1); + }//if (compoundPairIdx<maxNumCompoundPairsCapacity) + }// + }//fi (1) + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + }//for (int b=0;b<numChildrenB;b++) + return; + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) + && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + { + int numChildrenB = collidables[collidableIndexB].m_numChildShapes; + for (int b=0;b<numChildrenB;b++) + { + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = qtRotate(ornB,childPosB)+posB; + float4 newOrnB = qtMul(ornB,childOrnB); + + int shapeIndexB = collidables[childColIndexB].m_shapeIndex; + + + ////////////////////////////////////// + + if (1) + { + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + float dmin = FLT_MAX; + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = newPosB; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =newOrnB; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + {// + int compoundPairIdx = atomic_inc(numCompoundPairsOut); + if (compoundPairIdx<maxNumCompoundPairsCapacity) + { + gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB); + }//fi (compoundPairIdx<maxNumCompoundPairsCapacity) + }// + }//fi (1) + }//for (int b=0;b<numChildrenB;b++) + return; + }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + return; + }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + }//i<numPairs +} + +// work-in-progress +__kernel void findSeparatingAxisKernel( __global const int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global volatile float4* separatingNormals, + __global volatile int* hasSeparatingAxis, + int numPairs + ) +{ + + int i = get_global_id(0); + + if (i<numPairs) + { + + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + hasSeparatingAxis[i] = 0; + return; + } + + + if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) + { + hasSeparatingAxis[i] = 0; + return; + } + + if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH)) + { + hasSeparatingAxis[i] = 0; + return; + } + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + + float dmin = FLT_MAX; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal; + + bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + hasSeparatingAxis[i] = 4; + if (!sepA) + { + hasSeparatingAxis[i] = 0; + } else + { + bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB, + posA,ornA, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + + if (!sepB) + { + hasSeparatingAxis[i] = 0; + } else + { + bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + if (!sepEE) + { + hasSeparatingAxis[i] = 0; + } else + { + hasSeparatingAxis[i] = 1; + separatingNormals[i] = sepNormal; + } + } + } + + } + +} + + +__kernel void findSeparatingAxisVertexFaceKernel( __global const int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global volatile float4* separatingNormals, + __global volatile int* hasSeparatingAxis, + __global float* dmins, + int numPairs + ) +{ + + int i = get_global_id(0); + + if (i<numPairs) + { + + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + hasSeparatingAxis[i] = 0; + + //once the broadphase avoids static-static pairs, we can remove this test + if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + { + return; + } + + + if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) + { + return; + } + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + + float dmin = FLT_MAX; + + dmins[i] = dmin; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal; + + bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + hasSeparatingAxis[i] = 4; + if (!sepA) + { + hasSeparatingAxis[i] = 0; + } else + { + bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB, + posA,ornA, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + + if (sepB) + { + dmins[i] = dmin; + hasSeparatingAxis[i] = 1; + separatingNormals[i] = sepNormal; + } + } + + } + +} + + +__kernel void findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global btAabbCL* aabbs, + __global float4* separatingNormals, + __global int* hasSeparatingAxis, + __global float* dmins, + __global const float4* unitSphereDirections, + int numUnitSphereDirections, + int numPairs + ) +{ + + int i = get_global_id(0); + + if (i<numPairs) + { + + if (hasSeparatingAxis[i]) + { + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + + float dmin = dmins[i]; + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 c0local = convexShapes[shapeIndexA].m_localCenter; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + float4 sepNormal = separatingNormals[i]; + + + + bool sepEE = false; + int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges; + if (numEdgeEdgeDirections<=numUnitSphereDirections) + { + sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,uniqueEdges,faces, + indices,&sepNormal,&dmin); + + if (!sepEE) + { + hasSeparatingAxis[i] = 0; + } else + { + hasSeparatingAxis[i] = 1; + separatingNormals[i] = sepNormal; + } + } + /* + ///else case is a separate kernel, to make Mac OSX OpenCL compiler happy + else + { + sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA, + posB,ornB, + DeltaC2, + vertices,unitSphereDirections,numUnitSphereDirections, + &sepNormal,&dmin); + if (!sepEE) + { + hasSeparatingAxis[i] = 0; + } else + { + hasSeparatingAxis[i] = 1; + separatingNormals[i] = sepNormal; + } + } + */ + } //if (hasSeparatingAxis[i]) + }//(i<numPairs) +} + + + + + +inline int findClippingFaces(const float4 separatingNormal, + const ConvexPolyhedronCL* hullA, + __global const ConvexPolyhedronCL* hullB, + const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, + __global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + int capacityWorldVerts, + const float minDist, float maxDist, + const float4* verticesA, + const btGpuFace* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + __global int4* clippingFaces, int pairIndex) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + + int closestFaceB=0; + float dmax = -FLT_MAX; + + { + for(int face=0;face<hullB->m_numFaces;face++) + { + const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, + facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = qtRotate(ornB, Normal); + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + + { + const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB]; + int numVertices = polyB.m_numIndices; + if (numVertices>capacityWorldVerts) + numVertices = capacityWorldVerts; + + for(int e0=0;e0<numVertices;e0++) + { + if (e0<capacityWorldVerts) + { + const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]]; + worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + } + + int closestFaceA=0; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = make_float4( + facesA[hullA->m_faceOffset+face].m_plane.x, + facesA[hullA->m_faceOffset+face].m_plane.y, + facesA[hullA->m_faceOffset+face].m_plane.z, + 0.f); + const float4 faceANormalWS = qtRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + worldNormalsA1[pairIndex] = faceANormalWS; + } + } + } + + int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices; + if (numVerticesA>capacityWorldVerts) + numVerticesA = capacityWorldVerts; + + for(int e0=0;e0<numVerticesA;e0++) + { + if (e0<capacityWorldVerts) + { + const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]]; + worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA); + } + } + + clippingFaces[pairIndex].x = closestFaceA; + clippingFaces[pairIndex].y = closestFaceB; + clippingFaces[pairIndex].z = numVerticesA; + clippingFaces[pairIndex].w = numWorldVertsB1; + + + return numContactsOut; +} + + + + +// work-in-progress +__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global const btGpuChildShape* gpuChildShapes, + __global btAabbCL* aabbs, + __global float4* concaveSeparatingNormalsOut, + __global int* concaveHasSeparatingNormals, + __global int4* clippingFacesOut, + __global float4* worldVertsA1GPU, + __global float4* worldNormalsAGPU, + __global float4* worldVertsB1GPU, + int vertexFaceCapacity, + int numConcavePairs + ) +{ + + int i = get_global_id(0); + if (i>=numConcavePairs) + return; + + concaveHasSeparatingNormals[i] = 0; + + int pairIdx = i; + + int bodyIndexA = concavePairs[i].x; + int bodyIndexB = concavePairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&& + collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + concavePairs[pairIdx].w = -1; + return; + } + + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + int numActualConcaveConvexTests = 0; + + int f = concavePairs[i].z; + + bool overlap = false; + + ConvexPolyhedronCL convexPolyhedronA; + + //add 3 vertices of the triangle + convexPolyhedronA.m_numVertices = 3; + convexPolyhedronA.m_vertexOffset = 0; + float4 localCenter = make_float4(0.f,0.f,0.f,0.f); + + btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f]; + float4 triMinAabb, triMaxAabb; + btAabbCL triAabb; + triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f); + triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f); + + float4 verticesA[3]; + for (int i=0;i<3;i++) + { + int index = indices[face.m_indexOffset+i]; + float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index]; + verticesA[i] = vert; + localCenter += vert; + + triAabb.m_min = min(triAabb.m_min,vert); + triAabb.m_max = max(triAabb.m_max,vert); + + } + + overlap = true; + overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap; + overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap; + overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap; + + if (overlap) + { + float dmin = FLT_MAX; + int hasSeparatingAxis=5; + float4 sepAxis=make_float4(1,2,3,4); + + int localCC=0; + numActualConcaveConvexTests++; + + //a triangle has 3 unique edges + convexPolyhedronA.m_numUniqueEdges = 3; + convexPolyhedronA.m_uniqueEdgesOffset = 0; + float4 uniqueEdgesA[3]; + + uniqueEdgesA[0] = (verticesA[1]-verticesA[0]); + uniqueEdgesA[1] = (verticesA[2]-verticesA[1]); + uniqueEdgesA[2] = (verticesA[0]-verticesA[2]); + + + convexPolyhedronA.m_faceOffset = 0; + + float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); + + btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES]; + int indicesA[3+3+2+2+2]; + int curUsedIndices=0; + int fidx=0; + + //front size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[0] = 0; + indicesA[1] = 1; + indicesA[2] = 2; + curUsedIndices+=3; + float c = face.m_plane.w; + facesA[fidx].m_plane.x = normal.x; + facesA[fidx].m_plane.y = normal.y; + facesA[fidx].m_plane.z = normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + //back size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[3]=2; + indicesA[4]=1; + indicesA[5]=0; + curUsedIndices+=3; + float c = dot(normal,verticesA[0]); + float c1 = -face.m_plane.w; + facesA[fidx].m_plane.x = -normal.x; + facesA[fidx].m_plane.y = -normal.y; + facesA[fidx].m_plane.z = -normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + + bool addEdgePlanes = true; + if (addEdgePlanes) + { + int numVertices=3; + int prevVertex = numVertices-1; + for (int i=0;i<numVertices;i++) + { + float4 v0 = verticesA[i]; + float4 v1 = verticesA[prevVertex]; + + float4 edgeNormal = normalize(cross(normal,v1-v0)); + float c = -dot(edgeNormal,v0); + + facesA[fidx].m_numIndices = 2; + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[curUsedIndices++]=i; + indicesA[curUsedIndices++]=prevVertex; + + facesA[fidx].m_plane.x = edgeNormal.x; + facesA[fidx].m_plane.y = edgeNormal.y; + facesA[fidx].m_plane.z = edgeNormal.z; + facesA[fidx].m_plane.w = c; + fidx++; + prevVertex = i; + } + } + convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; + convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); + + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + + + + + /////////////////// + ///compound shape support + + if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int compoundChild = concavePairs[pairIdx].w; + int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + shapeIndexB = collidables[childColIndexB].m_shapeIndex; + } + ////////////////// + + float4 c0local = convexPolyhedronA.m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + + + bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + DeltaC2, + verticesA,uniqueEdgesA,facesA,indicesA, + vertices,uniqueEdges,faces,indices, + &sepAxis,&dmin); + hasSeparatingAxis = 4; + if (!sepA) + { + hasSeparatingAxis = 0; + } else + { + bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA, + posB,ornB, + posA,ornA, + DeltaC2, + vertices,uniqueEdges,faces,indices, + verticesA,uniqueEdgesA,facesA,indicesA, + &sepAxis,&dmin); + + if (!sepB) + { + hasSeparatingAxis = 0; + } else + { + bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + DeltaC2, + verticesA,uniqueEdgesA,facesA,indicesA, + vertices,uniqueEdges,faces,indices, + &sepAxis,&dmin); + + if (!sepEE) + { + hasSeparatingAxis = 0; + } else + { + hasSeparatingAxis = 1; + } + } + } + + if (hasSeparatingAxis) + { + sepAxis.w = dmin; + concaveSeparatingNormalsOut[pairIdx]=sepAxis; + concaveHasSeparatingNormals[i]=1; + + + float minDist = -1e30f; + float maxDist = 0.02f; + + + + findClippingFaces(sepAxis, + &convexPolyhedronA, + &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + worldVertsA1GPU, + worldNormalsAGPU, + worldVertsB1GPU, + vertexFaceCapacity, + minDist, maxDist, + verticesA, + facesA, + indicesA, + vertices, + faces, + indices, + clippingFacesOut, pairIdx); + + + } else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } + } + else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } + + concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts +} + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl new file mode 100644 index 0000000000..f433971741 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl @@ -0,0 +1,1888 @@ + +#define TRIANGLE_NUM_CONVEX_FACES 5 + + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + +#define max2 max +#define min2 min + +typedef unsigned int u32; + + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + + + +#define GET_NPOINTS(x) (x).m_worldNormalOnB.w + + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +//#define dot3F4 dot + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + +__inline +float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) +{ + return qtRotate( *orientation, *p ) + (*translation); +} + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +} + + +__inline float4 lerp3(const float4 a,const float4 b, float t) +{ + return make_float4( a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t, + 0.f); +} + + + +// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut +int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut) +{ + + int ve; + float ds, de; + int numVertsOut = 0; + //double-check next test + if (numVertsIn < 2) + return 0; + + float4 firstVertex=pVtxIn[numVertsIn-1]; + float4 endVertex = pVtxIn[0]; + + ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS; + + for (ve = 0; ve < numVertsIn; ve++) + { + endVertex=pVtxIn[ve]; + de = dot3F4(planeNormalWS,endVertex)+planeEqWS; + if (ds<0) + { + if (de<0) + { + // Start < 0, end < 0, so output endVertex + ppVtxOut[numVertsOut++] = endVertex; + } + else + { + // Start < 0, end >= 0, so output intersection + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + } + } + else + { + if (de<0) + { + // Start >= 0, end < 0 so output intersection and end + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + ppVtxOut[numVertsOut++] = endVertex; + } + } + firstVertex = endVertex; + ds = de; + } + return numVertsOut; +} + + + +// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut +int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut) +{ + + int ve; + float ds, de; + int numVertsOut = 0; +//double-check next test + if (numVertsIn < 2) + return 0; + + float4 firstVertex=pVtxIn[numVertsIn-1]; + float4 endVertex = pVtxIn[0]; + + ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS; + + for (ve = 0; ve < numVertsIn; ve++) + { + endVertex=pVtxIn[ve]; + + de = dot3F4(planeNormalWS,endVertex)+planeEqWS; + + if (ds<0) + { + if (de<0) + { + // Start < 0, end < 0, so output endVertex + ppVtxOut[numVertsOut++] = endVertex; + } + else + { + // Start < 0, end >= 0, so output intersection + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + } + } + else + { + if (de<0) + { + // Start >= 0, end < 0 so output intersection and end + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + ppVtxOut[numVertsOut++] = endVertex; + } + } + firstVertex = endVertex; + ds = de; + } + return numVertsOut; +} + + +int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA, + const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1, + float4* worldVertsB2, int capacityWorldVertsB2, + const float minDist, float maxDist, + __global const float4* vertices, + __global const b3GpuFace_t* faces, + __global const int* indices, + float4* contactsOut, + int contactCapacity) +{ + int numContactsOut = 0; + + float4* pVtxIn = worldVertsB1; + float4* pVtxOut = worldVertsB2; + + int numVertsIn = numWorldVertsB1; + int numVertsOut = 0; + + int closestFaceA=-1; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = make_float4( + faces[hullA->m_faceOffset+face].m_plane.x, + faces[hullA->m_faceOffset+face].m_plane.y, + faces[hullA->m_faceOffset+face].m_plane.z,0.f); + const float4 faceANormalWS = qtRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + } + } + } + if (closestFaceA<0) + return numContactsOut; + + b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA]; + + // clip polygon to back of planes of all faces of hull A that are adjacent to witness face + int numVerticesA = polyA.m_numIndices; + for(int e0=0;e0<numVerticesA;e0++) + { + const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]]; + const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]]; + const float4 edge0 = a - b; + const float4 WorldEdge0 = qtRotate(ornA,edge0); + float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA); + + float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); + float4 worldA1 = transform(&a,&posA,&ornA); + float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); + + float4 planeNormalWS = planeNormalWS1; + float planeEqWS=planeEqWS1; + + //clip face + //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS); + numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut); + + //btSwap(pVtxIn,pVtxOut); + float4* tmp = pVtxOut; + pVtxOut = pVtxIn; + pVtxIn = tmp; + numVertsIn = numVertsOut; + numVertsOut = 0; + } + + + // only keep points that are behind the witness face + { + float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float localPlaneEq = polyA.m_plane.w; + float4 planeNormalWS = qtRotate(ornA,localPlaneNormal); + float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA); + for (int i=0;i<numVertsIn;i++) + { + float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; + if (depth <=minDist) + { + depth = minDist; + } + + if (depth <=maxDist) + { + float4 pointInWorld = pVtxIn[i]; + //resultOut.addContactPoint(separatingNormal,point,depth); + contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + } + } + } + + return numContactsOut; +} + + + +int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA, + const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1, + float4* worldVertsB2, int capacityWorldVertsB2, + const float minDist, float maxDist, + const float4* verticesA, + const b3GpuFace_t* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const b3GpuFace_t* facesB, + __global const int* indicesB, + float4* contactsOut, + int contactCapacity) +{ + int numContactsOut = 0; + + float4* pVtxIn = worldVertsB1; + float4* pVtxOut = worldVertsB2; + + int numVertsIn = numWorldVertsB1; + int numVertsOut = 0; + + int closestFaceA=-1; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = make_float4( + facesA[hullA->m_faceOffset+face].m_plane.x, + facesA[hullA->m_faceOffset+face].m_plane.y, + facesA[hullA->m_faceOffset+face].m_plane.z,0.f); + const float4 faceANormalWS = qtRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + } + } + } + if (closestFaceA<0) + return numContactsOut; + + b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA]; + + // clip polygon to back of planes of all faces of hull A that are adjacent to witness face + int numVerticesA = polyA.m_numIndices; + for(int e0=0;e0<numVerticesA;e0++) + { + const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]]; + const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]]; + const float4 edge0 = a - b; + const float4 WorldEdge0 = qtRotate(ornA,edge0); + float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA); + + float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); + float4 worldA1 = transform(&a,&posA,&ornA); + float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); + + float4 planeNormalWS = planeNormalWS1; + float planeEqWS=planeEqWS1; + + //clip face + //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS); + numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut); + + //btSwap(pVtxIn,pVtxOut); + float4* tmp = pVtxOut; + pVtxOut = pVtxIn; + pVtxIn = tmp; + numVertsIn = numVertsOut; + numVertsOut = 0; + } + + + // only keep points that are behind the witness face + { + float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float localPlaneEq = polyA.m_plane.w; + float4 planeNormalWS = qtRotate(ornA,localPlaneNormal); + float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA); + for (int i=0;i<numVertsIn;i++) + { + float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; + if (depth <=minDist) + { + depth = minDist; + } + + if (depth <=maxDist) + { + float4 pointInWorld = pVtxIn[i]; + //resultOut.addContactPoint(separatingNormal,point,depth); + contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + } + } + } + + return numContactsOut; +} + +int clipHullAgainstHull(const float4 separatingNormal, + __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, + const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, + float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts, + const float minDist, float maxDist, + __global const float4* vertices, + __global const b3GpuFace_t* faces, + __global const int* indices, + float4* localContactsOut, + int localContactCapacity) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + + int closestFaceB=-1; + float dmax = -FLT_MAX; + + { + for(int face=0;face<hullB->m_numFaces;face++) + { + const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, + faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = qtRotate(ornB, Normal); + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + + { + const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB]; + const int numVertices = polyB.m_numIndices; + for(int e0=0;e0<numVertices;e0++) + { + const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]]; + worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + + if (closestFaceB>=0) + { + numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, + posA,ornA, + worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices, + faces, + indices,localContactsOut,localContactCapacity); + } + + return numContactsOut; +} + + +int clipHullAgainstHullLocalA(const float4 separatingNormal, + const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, + const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, + float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts, + const float minDist, float maxDist, + const float4* verticesA, + const b3GpuFace_t* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const b3GpuFace_t* facesB, + __global const int* indicesB, + float4* localContactsOut, + int localContactCapacity) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + + int closestFaceB=-1; + float dmax = -FLT_MAX; + + { + for(int face=0;face<hullB->m_numFaces;face++) + { + const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, + facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = qtRotate(ornB, Normal); + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + + { + const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB]; + const int numVertices = polyB.m_numIndices; + for(int e0=0;e0<numVertices;e0++) + { + const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]]; + worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + + if (closestFaceB>=0) + { + numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, + posA,ornA, + worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist, + verticesA,facesA,indicesA, + verticesB,facesB,indicesB, + localContactsOut,localContactCapacity); + } + + return numContactsOut; +} + +#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; +#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} +#define REDUCE_MAX(v, n) {int i=0;\ +for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } +#define REDUCE_MIN(v, n) {int i=0;\ +for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; } + +int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx) +{ + if( nPoints == 0 ) + return 0; + + if (nPoints <=4) + return nPoints; + + + if (nPoints >64) + nPoints = 64; + + float4 center = make_float4(0.f); + { + + for (int i=0;i<nPoints;i++) + center += p[i]; + center /= (float)nPoints; + } + + + + // sample 4 directions + + float4 aVector = p[0] - center; + float4 u = cross3( nearNormal, aVector ); + float4 v = cross3( nearNormal, u ); + u = normalize3( u ); + v = normalize3( v ); + + + //keep point with deepest penetration + float minW= FLT_MAX; + + int minIndex=-1; + + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // idx, distance + for(int ie = 0; ie<nPoints; ie++ ) + { + if (p[ie].w<minW) + { + minW = p[ie].w; + minIndex=ie; + } + float f; + float4 r = p[ie]-center; + f = dot3F4( u, r ); + if (f<maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4( -u, r ); + if (f<maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + + f = dot3F4( v, r ); + if (f<maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4( -v, r ); + if (f<maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; + +} + + +int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx) +{ + contactIdx[0] = 0; + contactIdx[1] = 1; + contactIdx[2] = 2; + contactIdx[3] = 3; + + if( nPoints == 0 ) return 0; + + nPoints = min2( nPoints, 4 ); + return nPoints; + +} + + + +int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx) +{ + if( nPoints == 0 ) return 0; + + nPoints = min2( nPoints, 64 ); + + float4 center = make_float4(0.f); + { + float4 v[64]; + for (int i=0;i<nPoints;i++) + v[i] = p[i]; + //memcpy( v, p, nPoints*sizeof(float4) ); + PARALLEL_SUM( v, nPoints ); + center = v[0]/(float)nPoints; + } + + + + { // sample 4 directions + if( nPoints < 4 ) + { + for(int i=0; i<nPoints; i++) + contactIdx[i] = i; + return nPoints; + } + + float4 aVector = p[0] - center; + float4 u = cross3( nearNormal, aVector ); + float4 v = cross3( nearNormal, u ); + u = normalize3( u ); + v = normalize3( v ); + + int idx[4]; + + float2 max00 = make_float2(0,FLT_MAX); + { + // idx, distance + { + { + int4 a[64]; + for(int ie = 0; ie<nPoints; ie++ ) + { + + + float f; + float4 r = p[ie]-center; + f = dot3F4( u, r ); + a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie); + + f = dot3F4( -u, r ); + a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie); + + f = dot3F4( v, r ); + a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie); + + f = dot3F4( -v, r ); + a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie); + } + + for(int ie=0; ie<nPoints; ie++) + { + a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x; + a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y; + a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z; + a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w; + } + + idx[0] = (int)a[0].x & 0xff; + idx[1] = (int)a[0].y & 0xff; + idx[2] = (int)a[0].z & 0xff; + idx[3] = (int)a[0].w & 0xff; + } + } + + { + float2 h[64]; + PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints ); + REDUCE_MIN( h, nPoints ); + max00 = h[0]; + } + } + + contactIdx[0] = idx[0]; + contactIdx[1] = idx[1]; + contactIdx[2] = idx[2]; + contactIdx[3] = idx[3]; + + + return 4; + } +} + + + +__kernel void extractManifoldAndAddContactKernel(__global const int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const float4* closestPointsWorld, + __global const float4* separatingNormalsWorld, + __global const int* contactCounts, + __global const int* contactOffsets, + __global struct b3Contact4Data* restrict contactsOut, + counter32_t nContactsOut, + int contactCapacity, + int numPairs, + int pairIndex + ) +{ + int idx = get_global_id(0); + + if (idx<numPairs) + { + float4 normal = separatingNormalsWorld[idx]; + int nPoints = contactCounts[idx]; + __global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]]; + float4 localPoints[64]; + for (int i=0;i<nPoints;i++) + { + localPoints[i] = pointsIn[i]; + } + + int contactIdx[4];// = {-1,-1,-1,-1}; + contactIdx[0] = -1; + contactIdx[1] = -1; + contactIdx[2] = -1; + contactIdx[3] = -1; + + int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx); + + int dstIdx; + AppendInc( nContactsOut, dstIdx ); + if (dstIdx<contactCapacity) + { + __global struct b3Contact4Data* c = contactsOut + dstIdx; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = idx; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + for (int i=0;i<nContacts;i++) + { + c->m_worldPosB[i] = localPoints[contactIdx[i]]; + } + GET_NPOINTS(*c) = nContacts; + } + } +} + + +void trInverse(float4 translationIn, Quaternion orientationIn, + float4* translationOut, Quaternion* orientationOut) +{ + *orientationOut = qtInvert(orientationIn); + *translationOut = qtRotate(*orientationOut, -translationIn); +} + +void trMul(float4 translationA, Quaternion orientationA, + float4 translationB, Quaternion orientationB, + float4* translationOut, Quaternion* orientationOut) +{ + *orientationOut = qtMul(orientationA,orientationB); + *translationOut = transform(&translationB,&translationA,&orientationA); +} + + + + +__kernel void clipHullHullKernel( __global int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const b3GpuFace_t* faces, + __global const int* indices, + __global const float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int numPairs, + int contactCapacity) +{ + + int i = get_global_id(0); + int pairIndex = i; + + float4 worldVertsB1[64]; + float4 worldVertsB2[64]; + int capacityWorldVerts = 64; + + float4 localContactsOut[64]; + int localContactCapacity=64; + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numPairs) + { + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + if (hasSeparatingAxis[i]) + { + + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + + + int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i], + &convexShapes[shapeIndexA], &convexShapes[shapeIndexB], + rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat, + rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat, + worldVertsB1,worldVertsB2,capacityWorldVerts, + minDist, maxDist, + vertices,faces,indices, + localContactsOut,localContactCapacity); + + if (numLocalContactsOut>0) + { + float4 normal = -separatingNormals[i]; + int nPoints = numLocalContactsOut; + float4* pointsIn = localContactsOut; + int contactIdx[4];// = {-1,-1,-1,-1}; + + contactIdx[0] = -1; + contactIdx[1] = -1; + contactIdx[2] = -1; + contactIdx[3] = -1; + + int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx); + + + int mprContactIndex = pairs[pairIndex].z; + + int dstIdx = mprContactIndex; + if (dstIdx<0) + { + AppendInc( nGlobalContactsOut, dstIdx ); + } + + if (dstIdx<contactCapacity) + { + pairs[pairIndex].z = dstIdx; + + __global struct b3Contact4Data* c = globalContactsOut+ dstIdx; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + + for (int i=0;i<nReducedContacts;i++) + { + //this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact + if (i>0||(mprContactIndex<0)) + { + c->m_worldPosB[i] = pointsIn[contactIdx[i]]; + } + } + GET_NPOINTS(*c) = nReducedContacts; + } + + }// if (numContactsOut>0) + }// if (hasSeparatingAxis[i]) + }// if (i<numPairs) + +} + + +__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const b3GpuFace_t* faces, + __global const int* indices, + __global const b3GpuChildShape_t* gpuChildShapes, + __global const float4* gpuCompoundSepNormalsOut, + __global const int* gpuHasCompoundSepNormalsOut, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int numCompoundPairs, int maxContactCapacity) +{ + + int i = get_global_id(0); + int pairIndex = i; + + float4 worldVertsB1[64]; + float4 worldVertsB2[64]; + int capacityWorldVerts = 64; + + float4 localContactsOut[64]; + int localContactCapacity=64; + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numCompoundPairs) + { + + if (gpuHasCompoundSepNormalsOut[i]) + { + + int bodyIndexA = gpuCompoundPairs[i].x; + int bodyIndexB = gpuCompoundPairs[i].y; + + int childShapeIndexA = gpuCompoundPairs[i].z; + int childShapeIndexB = gpuCompoundPairs[i].w; + + int collidableIndexA = -1; + int collidableIndexB = -1; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 posA = rigidBodies[bodyIndexA].m_pos; + + float4 ornB = rigidBodies[bodyIndexB].m_quat; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + if (childShapeIndexA >= 0) + { + collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; + float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; + float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = qtRotate(ornA,childPosA)+posA; + float4 newOrnA = qtMul(ornA,childOrnA); + posA = newPosA; + ornA = newOrnA; + } else + { + collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + } + + if (childShapeIndexB>=0) + { + collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + } else + { + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + } + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i], + &convexShapes[shapeIndexA], &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + worldVertsB1,worldVertsB2,capacityWorldVerts, + minDist, maxDist, + vertices,faces,indices, + localContactsOut,localContactCapacity); + + if (numLocalContactsOut>0) + { + float4 normal = -gpuCompoundSepNormalsOut[i]; + int nPoints = numLocalContactsOut; + float4* pointsIn = localContactsOut; + int contactIdx[4];// = {-1,-1,-1,-1}; + + contactIdx[0] = -1; + contactIdx[1] = -1; + contactIdx[2] = -1; + contactIdx[3] = -1; + + int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx); + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + if ((dstIdx+nReducedContacts) < maxContactCapacity) + { + __global struct b3Contact4Data* c = globalContactsOut+ dstIdx; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = gpuCompoundPairs[pairIndex].x; + int bodyB = gpuCompoundPairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_childIndexA = childShapeIndexA; + c->m_childIndexB = childShapeIndexB; + for (int i=0;i<nReducedContacts;i++) + { + c->m_worldPosB[i] = pointsIn[contactIdx[i]]; + } + GET_NPOINTS(*c) = nReducedContacts; + } + + }// if (numContactsOut>0) + }// if (gpuHasCompoundSepNormalsOut[i]) + }// if (i<numCompoundPairs) + +} + + + +__kernel void sphereSphereCollisionKernel( __global const int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int contactCapacity, + int numPairs) +{ + + int i = get_global_id(0); + int pairIndex = i; + + if (i<numPairs) + { + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && + collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) + { + //sphere-sphere + float radiusA = collidables[collidableIndexA].m_radius; + float radiusB = collidables[collidableIndexB].m_radius; + float4 posA = rigidBodies[bodyIndexA].m_pos; + float4 posB = rigidBodies[bodyIndexB].m_pos; + + float4 diff = posA-posB; + float len = length(diff); + + ///iff distance positive, don't generate a new contact + if ( len <= (radiusA+radiusB)) + { + ///distance (negative means penetration) + float dist = len - (radiusA+radiusB); + float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f); + if (len > 0.00001) + { + normalOnSurfaceB = diff / len; + } + float4 contactPosB = posB + normalOnSurfaceB*radiusB; + contactPosB.w = dist; + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + if (dstIdx < contactCapacity) + { + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -normalOnSurfaceB; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_worldPosB[0] = contactPosB; + c->m_childIndexA = -1; + c->m_childIndexB = -1; + + GET_NPOINTS(*c) = 1; + }//if (dstIdx < numPairs) + }//if ( len <= (radiusA+radiusB)) + }//SHAPE_SPHERE SHAPE_SPHERE + }//if (i<numPairs) +} + +__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const b3GpuFace_t* faces, + __global const int* indices, + __global const b3GpuChildShape_t* gpuChildShapes, + __global const float4* separatingNormals, + __global struct b3Contact4Data* restrict globalContactsOut, + counter32_t nGlobalContactsOut, + int contactCapacity, + int numConcavePairs) +{ + + int i = get_global_id(0); + int pairIndex = i; + + float4 worldVertsB1[64]; + float4 worldVertsB2[64]; + int capacityWorldVerts = 64; + + float4 localContactsOut[64]; + int localContactCapacity=64; + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numConcavePairs) + { + //negative value means that the pair is invalid + if (concavePairsIn[i].w<0) + return; + + int bodyIndexA = concavePairsIn[i].x; + int bodyIndexB = concavePairsIn[i].y; + int f = concavePairsIn[i].z; + int childShapeIndexA = f; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + /////////////////////////////////////////////////////////////// + + + bool overlap = false; + + b3ConvexPolyhedronData_t convexPolyhedronA; + + //add 3 vertices of the triangle + convexPolyhedronA.m_numVertices = 3; + convexPolyhedronA.m_vertexOffset = 0; + float4 localCenter = make_float4(0.f,0.f,0.f,0.f); + + b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f]; + + float4 verticesA[3]; + for (int i=0;i<3;i++) + { + int index = indices[face.m_indexOffset+i]; + float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index]; + verticesA[i] = vert; + localCenter += vert; + } + + float dmin = FLT_MAX; + + int localCC=0; + + //a triangle has 3 unique edges + convexPolyhedronA.m_numUniqueEdges = 3; + convexPolyhedronA.m_uniqueEdgesOffset = 0; + float4 uniqueEdgesA[3]; + + uniqueEdgesA[0] = (verticesA[1]-verticesA[0]); + uniqueEdgesA[1] = (verticesA[2]-verticesA[1]); + uniqueEdgesA[2] = (verticesA[0]-verticesA[2]); + + + convexPolyhedronA.m_faceOffset = 0; + + float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); + + b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES]; + int indicesA[3+3+2+2+2]; + int curUsedIndices=0; + int fidx=0; + + //front size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[0] = 0; + indicesA[1] = 1; + indicesA[2] = 2; + curUsedIndices+=3; + float c = face.m_plane.w; + facesA[fidx].m_plane.x = normal.x; + facesA[fidx].m_plane.y = normal.y; + facesA[fidx].m_plane.z = normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + //back size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[3]=2; + indicesA[4]=1; + indicesA[5]=0; + curUsedIndices+=3; + float c = dot3F4(normal,verticesA[0]); + float c1 = -face.m_plane.w; + facesA[fidx].m_plane.x = -normal.x; + facesA[fidx].m_plane.y = -normal.y; + facesA[fidx].m_plane.z = -normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + + bool addEdgePlanes = true; + if (addEdgePlanes) + { + int numVertices=3; + int prevVertex = numVertices-1; + for (int i=0;i<numVertices;i++) + { + float4 v0 = verticesA[i]; + float4 v1 = verticesA[prevVertex]; + + float4 edgeNormal = normalize(cross(normal,v1-v0)); + float c = -dot3F4(edgeNormal,v0); + + facesA[fidx].m_numIndices = 2; + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[curUsedIndices++]=i; + indicesA[curUsedIndices++]=prevVertex; + + facesA[fidx].m_plane.x = edgeNormal.x; + facesA[fidx].m_plane.y = edgeNormal.y; + facesA[fidx].m_plane.z = edgeNormal.z; + facesA[fidx].m_plane.w = c; + fidx++; + prevVertex = i; + } + } + convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; + convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); + + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + + + float4 sepAxis = separatingNormals[i]; + + int shapeTypeB = collidables[collidableIndexB].m_shapeType; + int childShapeIndexB =-1; + if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + /////////////////// + ///compound shape support + + childShapeIndexB = concavePairsIn[pairIndex].w; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + shapeIndexB = collidables[childColIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + + } + + //////////////////////////////////////// + + + + int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis, + &convexPolyhedronA, &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + worldVertsB1,worldVertsB2,capacityWorldVerts, + minDist, maxDist, + &verticesA,&facesA,&indicesA, + vertices,faces,indices, + localContactsOut,localContactCapacity); + + if (numLocalContactsOut>0) + { + float4 normal = -separatingNormals[i]; + int nPoints = numLocalContactsOut; + float4* pointsIn = localContactsOut; + int contactIdx[4];// = {-1,-1,-1,-1}; + + contactIdx[0] = -1; + contactIdx[1] = -1; + contactIdx[2] = -1; + contactIdx[3] = -1; + + int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx); + + int dstIdx; + AppendInc( nGlobalContactsOut, dstIdx ); + if (dstIdx<contactCapacity) + { + __global struct b3Contact4Data* c = globalContactsOut+ dstIdx; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = concavePairsIn[pairIndex].x; + int bodyB = concavePairsIn[pairIndex].y; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_childIndexA = childShapeIndexA; + c->m_childIndexB = childShapeIndexB; + for (int i=0;i<nReducedContacts;i++) + { + c->m_worldPosB[i] = pointsIn[contactIdx[i]]; + } + GET_NPOINTS(*c) = nReducedContacts; + } + + }// if (numContactsOut>0) + }// if (i<numPairs) +} + + + + + + +int findClippingFaces(const float4 separatingNormal, + __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, + const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, + __global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + int capacityWorldVerts, + const float minDist, float maxDist, + __global const float4* vertices, + __global const b3GpuFace_t* faces, + __global const int* indices, + __global int4* clippingFaces, int pairIndex) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + + int closestFaceB=-1; + float dmax = -FLT_MAX; + + { + for(int face=0;face<hullB->m_numFaces;face++) + { + const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, + faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = qtRotate(ornB, Normal); + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + + { + const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB]; + const int numVertices = polyB.m_numIndices; + for(int e0=0;e0<numVertices;e0++) + { + const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]]; + worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + + int closestFaceA=-1; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = make_float4( + faces[hullA->m_faceOffset+face].m_plane.x, + faces[hullA->m_faceOffset+face].m_plane.y, + faces[hullA->m_faceOffset+face].m_plane.z, + 0.f); + const float4 faceANormalWS = qtRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + worldNormalsA1[pairIndex] = faceANormalWS; + } + } + } + + int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices; + for(int e0=0;e0<numVerticesA;e0++) + { + const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]]; + worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA); + } + + clippingFaces[pairIndex].x = closestFaceA; + clippingFaces[pairIndex].y = closestFaceB; + clippingFaces[pairIndex].z = numVerticesA; + clippingFaces[pairIndex].w = numWorldVertsB1; + + + return numContactsOut; +} + + + +int clipFaces(__global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + __global float4* worldVertsB2, + int capacityWorldVertsB2, + const float minDist, float maxDist, + __global int4* clippingFaces, + int pairIndex) +{ + int numContactsOut = 0; + + int closestFaceA = clippingFaces[pairIndex].x; + int closestFaceB = clippingFaces[pairIndex].y; + int numVertsInA = clippingFaces[pairIndex].z; + int numVertsInB = clippingFaces[pairIndex].w; + + int numVertsOut = 0; + + if (closestFaceA<0) + return numContactsOut; + + __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2]; + __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2]; + + + + // clip polygon to back of planes of all faces of hull A that are adjacent to witness face + + for(int e0=0;e0<numVertsInA;e0++) + { + const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0]; + const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; + const float4 WorldEdge0 = aw - bw; + float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex]; + float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); + float4 worldA1 = aw; + float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); + float4 planeNormalWS = planeNormalWS1; + float planeEqWS=planeEqWS1; + numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut); + __global float4* tmp = pVtxOut; + pVtxOut = pVtxIn; + pVtxIn = tmp; + numVertsInB = numVertsOut; + numVertsOut = 0; + } + + //float4 planeNormalWS = worldNormalsA1[pairIndex]; + //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]); + + + + /*for (int i=0;i<numVertsInB;i++) + { + pVtxOut[i] = pVtxIn[i]; + }*/ + + + + + //numVertsInB=0; + + float4 planeNormalWS = worldNormalsA1[pairIndex]; + float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]); + + for (int i=0;i<numVertsInB;i++) + { + float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; + if (depth <=minDist) + { + depth = minDist; + } + + if (depth <=maxDist) + { + float4 pointInWorld = pVtxIn[i]; + pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + } + } + + clippingFaces[pairIndex].w =numContactsOut; + + + return numContactsOut; + +} + + + + +__kernel void findClippingFacesKernel( __global const int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const b3Collidable_t* collidables, + __global const b3ConvexPolyhedronData_t* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const b3GpuFace_t* faces, + __global const int* indices, + __global const float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global int4* clippingFacesOut, + __global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + int capacityWorldVerts, + int numPairs + ) +{ + + int i = get_global_id(0); + int pairIndex = i; + + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numPairs) + { + + if (hasSeparatingAxis[i]) + { + + int bodyIndexA = pairs[i].x; + int bodyIndexB = pairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + + int numLocalContactsOut = findClippingFaces(separatingNormals[i], + &convexShapes[shapeIndexA], &convexShapes[shapeIndexB], + rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat, + rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat, + worldVertsA1, + worldNormalsA1, + worldVertsB1,capacityWorldVerts, + minDist, maxDist, + vertices,faces,indices, + clippingFacesOut,i); + + + }// if (hasSeparatingAxis[i]) + }// if (i<numPairs) + +} + + + + +__kernel void clipFacesAndFindContactsKernel( __global const float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global int4* clippingFacesOut, + __global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + __global float4* worldVertsB2, + int vertexFaceCapacity, + int numPairs, + int debugMode + ) +{ + int i = get_global_id(0); + int pairIndex = i; + + + float minDist = -1e30f; + float maxDist = 0.02f; + + if (i<numPairs) + { + + if (hasSeparatingAxis[i]) + { + +// int bodyIndexA = pairs[i].x; + // int bodyIndexB = pairs[i].y; + + int numLocalContactsOut = 0; + + int capacityWorldVertsB2 = vertexFaceCapacity; + + __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2]; + __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2]; + + + { + __global int4* clippingFaces = clippingFacesOut; + + + int closestFaceA = clippingFaces[pairIndex].x; + int closestFaceB = clippingFaces[pairIndex].y; + int numVertsInA = clippingFaces[pairIndex].z; + int numVertsInB = clippingFaces[pairIndex].w; + + int numVertsOut = 0; + + if (closestFaceA>=0) + { + + + + // clip polygon to back of planes of all faces of hull A that are adjacent to witness face + + for(int e0=0;e0<numVertsInA;e0++) + { + const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0]; + const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; + const float4 WorldEdge0 = aw - bw; + float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex]; + float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); + float4 worldA1 = aw; + float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1); + float4 planeNormalWS = planeNormalWS1; + float planeEqWS=planeEqWS1; + numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut); + __global float4* tmp = pVtxOut; + pVtxOut = pVtxIn; + pVtxIn = tmp; + numVertsInB = numVertsOut; + numVertsOut = 0; + } + + float4 planeNormalWS = worldNormalsA1[pairIndex]; + float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]); + + for (int i=0;i<numVertsInB;i++) + { + float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; + if (depth <=minDist) + { + depth = minDist; + } + + if (depth <=maxDist) + { + float4 pointInWorld = pVtxIn[i]; + pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + } + } + + } + clippingFaces[pairIndex].w =numLocalContactsOut; + + + } + + for (int i=0;i<numLocalContactsOut;i++) + pVtxIn[i] = pVtxOut[i]; + + }// if (hasSeparatingAxis[i]) + }// if (i<numPairs) + +} + + + + + +__kernel void newContactReductionKernel( __global int4* pairs, + __global const b3RigidBodyData_t* rigidBodies, + __global const float4* separatingNormals, + __global const int* hasSeparatingAxis, + __global struct b3Contact4Data* globalContactsOut, + __global int4* clippingFaces, + __global float4* worldVertsB2, + volatile __global int* nGlobalContactsOut, + int vertexFaceCapacity, + int contactCapacity, + int numPairs + ) +{ + int i = get_global_id(0); + int pairIndex = i; + + int4 contactIdx; + contactIdx=make_int4(0,1,2,3); + + if (i<numPairs) + { + + if (hasSeparatingAxis[i]) + { + + + + + int nPoints = clippingFaces[pairIndex].w; + + if (nPoints>0) + { + + __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity]; + float4 normal = -separatingNormals[i]; + + int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx); + + int mprContactIndex = pairs[pairIndex].z; + + int dstIdx = mprContactIndex; + + if (dstIdx<0) + { + AppendInc( nGlobalContactsOut, dstIdx ); + } +//#if 0 + + if (dstIdx < contactCapacity) + { + + __global struct b3Contact4Data* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = -normal; + c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_batchIdx = pairIndex; + int bodyA = pairs[pairIndex].x; + int bodyB = pairs[pairIndex].y; + + pairs[pairIndex].w = dstIdx; + + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_childIndexA =-1; + c->m_childIndexB =-1; + + switch (nReducedContacts) + { + case 4: + c->m_worldPosB[3] = pointsIn[contactIdx.w]; + case 3: + c->m_worldPosB[2] = pointsIn[contactIdx.z]; + case 2: + c->m_worldPosB[1] = pointsIn[contactIdx.y]; + case 1: + if (mprContactIndex<0)//test + c->m_worldPosB[0] = pointsIn[contactIdx.x]; + default: + { + } + }; + + GET_NPOINTS(*c) = nReducedContacts; + + } + + +//#endif + + }// if (numContactsOut>0) + }// if (hasSeparatingAxis[i]) + }// if (i<numPairs) + + + +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h new file mode 100644 index 0000000000..f0ecfc7851 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h @@ -0,0 +1,2099 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* satClipKernelsCL= \ +"#define TRIANGLE_NUM_CONVEX_FACES 5\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define max2 max\n" +"#define min2 min\n" +"typedef unsigned int u32;\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" +"#define B3_CONVEX_POLYHEDRON_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"typedef struct b3GpuFace b3GpuFace_t;\n" +"struct b3GpuFace\n" +"{\n" +" b3Float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +" int m_unusedPadding1;\n" +" int m_unusedPadding2;\n" +"};\n" +"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" +"struct b3ConvexPolyhedronData\n" +"{\n" +" b3Float4 m_localCenter;\n" +" b3Float4 m_extents;\n" +" b3Float4 mC;\n" +" b3Float4 mE;\n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"};\n" +"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" +"#ifndef B3_COLLIDABLE_H\n" +"#define B3_COLLIDABLE_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"enum b3ShapeTypes\n" +"{\n" +" SHAPE_HEIGHT_FIELD=1,\n" +" SHAPE_CONVEX_HULL=3,\n" +" SHAPE_PLANE=4,\n" +" SHAPE_CONCAVE_TRIMESH=5,\n" +" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" +" SHAPE_SPHERE=7,\n" +" MAX_NUM_SHAPE_TYPES,\n" +"};\n" +"typedef struct b3Collidable b3Collidable_t;\n" +"struct b3Collidable\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +"};\n" +"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" +"struct b3GpuChildShape\n" +"{\n" +" b3Float4 m_childPosition;\n" +" b3Quat m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"struct b3CompoundOverlappingPair\n" +"{\n" +" int m_bodyIndexA;\n" +" int m_bodyIndexB;\n" +"// int m_pairType;\n" +" int m_childShapeIndexA;\n" +" int m_childShapeIndexB;\n" +"};\n" +"#endif //B3_COLLIDABLE_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"//#define dot3F4 dot\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"__inline\n" +"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" +"{\n" +" return qtRotate( *orientation, *p ) + (*translation);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"}\n" +"__inline float4 lerp3(const float4 a,const float4 b, float t)\n" +"{\n" +" return make_float4( a.x + (b.x - a.x) * t,\n" +" a.y + (b.y - a.y) * t,\n" +" a.z + (b.z - a.z) * t,\n" +" 0.f);\n" +"}\n" +"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" +"int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n" +"{\n" +" \n" +" int ve;\n" +" float ds, de;\n" +" int numVertsOut = 0;\n" +" //double-check next test\n" +" if (numVertsIn < 2)\n" +" return 0;\n" +" \n" +" float4 firstVertex=pVtxIn[numVertsIn-1];\n" +" float4 endVertex = pVtxIn[0];\n" +" \n" +" ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" +" \n" +" for (ve = 0; ve < numVertsIn; ve++)\n" +" {\n" +" endVertex=pVtxIn[ve];\n" +" de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" +" if (ds<0)\n" +" {\n" +" if (de<0)\n" +" {\n" +" // Start < 0, end < 0, so output endVertex\n" +" ppVtxOut[numVertsOut++] = endVertex;\n" +" }\n" +" else\n" +" {\n" +" // Start < 0, end >= 0, so output intersection\n" +" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" +" }\n" +" }\n" +" else\n" +" {\n" +" if (de<0)\n" +" {\n" +" // Start >= 0, end < 0 so output intersection and end\n" +" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" +" ppVtxOut[numVertsOut++] = endVertex;\n" +" }\n" +" }\n" +" firstVertex = endVertex;\n" +" ds = de;\n" +" }\n" +" return numVertsOut;\n" +"}\n" +"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" +"int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n" +"{\n" +" \n" +" int ve;\n" +" float ds, de;\n" +" int numVertsOut = 0;\n" +"//double-check next test\n" +" if (numVertsIn < 2)\n" +" return 0;\n" +" float4 firstVertex=pVtxIn[numVertsIn-1];\n" +" float4 endVertex = pVtxIn[0];\n" +" \n" +" ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" +" for (ve = 0; ve < numVertsIn; ve++)\n" +" {\n" +" endVertex=pVtxIn[ve];\n" +" de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" +" if (ds<0)\n" +" {\n" +" if (de<0)\n" +" {\n" +" // Start < 0, end < 0, so output endVertex\n" +" ppVtxOut[numVertsOut++] = endVertex;\n" +" }\n" +" else\n" +" {\n" +" // Start < 0, end >= 0, so output intersection\n" +" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" +" }\n" +" }\n" +" else\n" +" {\n" +" if (de<0)\n" +" {\n" +" // Start >= 0, end < 0 so output intersection and end\n" +" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" +" ppVtxOut[numVertsOut++] = endVertex;\n" +" }\n" +" }\n" +" firstVertex = endVertex;\n" +" ds = de;\n" +" }\n" +" return numVertsOut;\n" +"}\n" +"int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA, \n" +" const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" +" float4* worldVertsB2, int capacityWorldVertsB2,\n" +" const float minDist, float maxDist,\n" +" __global const float4* vertices,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" float4* contactsOut,\n" +" int contactCapacity)\n" +"{\n" +" int numContactsOut = 0;\n" +" float4* pVtxIn = worldVertsB1;\n" +" float4* pVtxOut = worldVertsB2;\n" +" \n" +" int numVertsIn = numWorldVertsB1;\n" +" int numVertsOut = 0;\n" +" int closestFaceA=-1;\n" +" {\n" +" float dmin = FLT_MAX;\n" +" for(int face=0;face<hullA->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(\n" +" faces[hullA->m_faceOffset+face].m_plane.x, \n" +" faces[hullA->m_faceOffset+face].m_plane.y, \n" +" faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" +" \n" +" float d = dot3F4(faceANormalWS,separatingNormal);\n" +" if (d < dmin)\n" +" {\n" +" dmin = d;\n" +" closestFaceA = face;\n" +" }\n" +" }\n" +" }\n" +" if (closestFaceA<0)\n" +" return numContactsOut;\n" +" b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];\n" +" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" +" int numVerticesA = polyA.m_numIndices;\n" +" for(int e0=0;e0<numVerticesA;e0++)\n" +" {\n" +" const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n" +" const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" +" const float4 edge0 = a - b;\n" +" const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" +" float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" +" float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" +" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" +" float4 worldA1 = transform(&a,&posA,&ornA);\n" +" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" +" \n" +" float4 planeNormalWS = planeNormalWS1;\n" +" float planeEqWS=planeEqWS1;\n" +" \n" +" //clip face\n" +" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" +" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" +" //btSwap(pVtxIn,pVtxOut);\n" +" float4* tmp = pVtxOut;\n" +" pVtxOut = pVtxIn;\n" +" pVtxIn = tmp;\n" +" numVertsIn = numVertsOut;\n" +" numVertsOut = 0;\n" +" }\n" +" \n" +" // only keep points that are behind the witness face\n" +" {\n" +" float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" +" float localPlaneEq = polyA.m_plane.w;\n" +" float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" +" float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" +" for (int i=0;i<numVertsIn;i++)\n" +" {\n" +" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" +" if (depth <=minDist)\n" +" {\n" +" depth = minDist;\n" +" }\n" +" if (depth <=maxDist)\n" +" {\n" +" float4 pointInWorld = pVtxIn[i];\n" +" //resultOut.addContactPoint(separatingNormal,point,depth);\n" +" contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" +" }\n" +" }\n" +" }\n" +" return numContactsOut;\n" +"}\n" +"int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA, \n" +" const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" +" float4* worldVertsB2, int capacityWorldVertsB2,\n" +" const float minDist, float maxDist,\n" +" const float4* verticesA,\n" +" const b3GpuFace_t* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB,\n" +" __global const b3GpuFace_t* facesB,\n" +" __global const int* indicesB,\n" +" float4* contactsOut,\n" +" int contactCapacity)\n" +"{\n" +" int numContactsOut = 0;\n" +" float4* pVtxIn = worldVertsB1;\n" +" float4* pVtxOut = worldVertsB2;\n" +" \n" +" int numVertsIn = numWorldVertsB1;\n" +" int numVertsOut = 0;\n" +" int closestFaceA=-1;\n" +" {\n" +" float dmin = FLT_MAX;\n" +" for(int face=0;face<hullA->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(\n" +" facesA[hullA->m_faceOffset+face].m_plane.x, \n" +" facesA[hullA->m_faceOffset+face].m_plane.y, \n" +" facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" +" \n" +" float d = dot3F4(faceANormalWS,separatingNormal);\n" +" if (d < dmin)\n" +" {\n" +" dmin = d;\n" +" closestFaceA = face;\n" +" }\n" +" }\n" +" }\n" +" if (closestFaceA<0)\n" +" return numContactsOut;\n" +" b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];\n" +" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" +" int numVerticesA = polyA.m_numIndices;\n" +" for(int e0=0;e0<numVerticesA;e0++)\n" +" {\n" +" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n" +" const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" +" const float4 edge0 = a - b;\n" +" const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" +" float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" +" float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" +" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" +" float4 worldA1 = transform(&a,&posA,&ornA);\n" +" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" +" \n" +" float4 planeNormalWS = planeNormalWS1;\n" +" float planeEqWS=planeEqWS1;\n" +" \n" +" //clip face\n" +" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" +" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" +" //btSwap(pVtxIn,pVtxOut);\n" +" float4* tmp = pVtxOut;\n" +" pVtxOut = pVtxIn;\n" +" pVtxIn = tmp;\n" +" numVertsIn = numVertsOut;\n" +" numVertsOut = 0;\n" +" }\n" +" \n" +" // only keep points that are behind the witness face\n" +" {\n" +" float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" +" float localPlaneEq = polyA.m_plane.w;\n" +" float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" +" float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" +" for (int i=0;i<numVertsIn;i++)\n" +" {\n" +" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" +" if (depth <=minDist)\n" +" {\n" +" depth = minDist;\n" +" }\n" +" if (depth <=maxDist)\n" +" {\n" +" float4 pointInWorld = pVtxIn[i];\n" +" //resultOut.addContactPoint(separatingNormal,point,depth);\n" +" contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" +" }\n" +" }\n" +" }\n" +" return numContactsOut;\n" +"}\n" +"int clipHullAgainstHull(const float4 separatingNormal,\n" +" __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" +" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" +" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" +" const float minDist, float maxDist,\n" +" __global const float4* vertices,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" float4* localContactsOut,\n" +" int localContactCapacity)\n" +"{\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" int closestFaceB=-1;\n" +" float dmax = -FLT_MAX;\n" +" {\n" +" for(int face=0;face<hullB->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n" +" faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 WorldNormal = qtRotate(ornB, Normal);\n" +" float d = dot3F4(WorldNormal,separatingNormal);\n" +" if (d > dmax)\n" +" {\n" +" dmax = d;\n" +" closestFaceB = face;\n" +" }\n" +" }\n" +" }\n" +" {\n" +" const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" +" const int numVertices = polyB.m_numIndices;\n" +" for(int e0=0;e0<numVertices;e0++)\n" +" {\n" +" const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" +" worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" +" }\n" +" }\n" +" if (closestFaceB>=0)\n" +" {\n" +" numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n" +" posA,ornA,\n" +" worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n" +" faces,\n" +" indices,localContactsOut,localContactCapacity);\n" +" }\n" +" return numContactsOut;\n" +"}\n" +"int clipHullAgainstHullLocalA(const float4 separatingNormal,\n" +" const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" +" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" +" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" +" const float minDist, float maxDist,\n" +" const float4* verticesA,\n" +" const b3GpuFace_t* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB,\n" +" __global const b3GpuFace_t* facesB,\n" +" __global const int* indicesB,\n" +" float4* localContactsOut,\n" +" int localContactCapacity)\n" +"{\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" int closestFaceB=-1;\n" +" float dmax = -FLT_MAX;\n" +" {\n" +" for(int face=0;face<hullB->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n" +" facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 WorldNormal = qtRotate(ornB, Normal);\n" +" float d = dot3F4(WorldNormal,separatingNormal);\n" +" if (d > dmax)\n" +" {\n" +" dmax = d;\n" +" closestFaceB = face;\n" +" }\n" +" }\n" +" }\n" +" {\n" +" const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" +" const int numVertices = polyB.m_numIndices;\n" +" for(int e0=0;e0<numVertices;e0++)\n" +" {\n" +" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" +" worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" +" }\n" +" }\n" +" if (closestFaceB>=0)\n" +" {\n" +" numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n" +" posA,ornA,\n" +" worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n" +" verticesA,facesA,indicesA,\n" +" verticesB,facesB,indicesB,\n" +" localContactsOut,localContactCapacity);\n" +" }\n" +" return numContactsOut;\n" +"}\n" +"#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n" +"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n" +"#define REDUCE_MAX(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n" +"#define REDUCE_MIN(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }\n" +"int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" +"{\n" +" if( nPoints == 0 )\n" +" return 0;\n" +" \n" +" if (nPoints <=4)\n" +" return nPoints;\n" +" \n" +" \n" +" if (nPoints >64)\n" +" nPoints = 64;\n" +" \n" +" float4 center = make_float4(0.f);\n" +" {\n" +" \n" +" for (int i=0;i<nPoints;i++)\n" +" center += p[i];\n" +" center /= (float)nPoints;\n" +" }\n" +" \n" +" \n" +" \n" +" // sample 4 directions\n" +" \n" +" float4 aVector = p[0] - center;\n" +" float4 u = cross3( nearNormal, aVector );\n" +" float4 v = cross3( nearNormal, u );\n" +" u = normalize3( u );\n" +" v = normalize3( v );\n" +" \n" +" \n" +" //keep point with deepest penetration\n" +" float minW= FLT_MAX;\n" +" \n" +" int minIndex=-1;\n" +" \n" +" float4 maxDots;\n" +" maxDots.x = FLT_MIN;\n" +" maxDots.y = FLT_MIN;\n" +" maxDots.z = FLT_MIN;\n" +" maxDots.w = FLT_MIN;\n" +" \n" +" // idx, distance\n" +" for(int ie = 0; ie<nPoints; ie++ )\n" +" {\n" +" if (p[ie].w<minW)\n" +" {\n" +" minW = p[ie].w;\n" +" minIndex=ie;\n" +" }\n" +" float f;\n" +" float4 r = p[ie]-center;\n" +" f = dot3F4( u, r );\n" +" if (f<maxDots.x)\n" +" {\n" +" maxDots.x = f;\n" +" contactIdx[0].x = ie;\n" +" }\n" +" \n" +" f = dot3F4( -u, r );\n" +" if (f<maxDots.y)\n" +" {\n" +" maxDots.y = f;\n" +" contactIdx[0].y = ie;\n" +" }\n" +" \n" +" \n" +" f = dot3F4( v, r );\n" +" if (f<maxDots.z)\n" +" {\n" +" maxDots.z = f;\n" +" contactIdx[0].z = ie;\n" +" }\n" +" \n" +" f = dot3F4( -v, r );\n" +" if (f<maxDots.w)\n" +" {\n" +" maxDots.w = f;\n" +" contactIdx[0].w = ie;\n" +" }\n" +" \n" +" }\n" +" \n" +" if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" +" {\n" +" //replace the first contact with minimum (todo: replace contact with least penetration)\n" +" contactIdx[0].x = minIndex;\n" +" }\n" +" \n" +" return 4;\n" +" \n" +"}\n" +"int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" +"{\n" +" contactIdx[0] = 0;\n" +" contactIdx[1] = 1;\n" +" contactIdx[2] = 2;\n" +" contactIdx[3] = 3;\n" +" \n" +" if( nPoints == 0 ) return 0;\n" +" \n" +" nPoints = min2( nPoints, 4 );\n" +" return nPoints;\n" +" \n" +"}\n" +"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" +"{\n" +" if( nPoints == 0 ) return 0;\n" +" nPoints = min2( nPoints, 64 );\n" +" float4 center = make_float4(0.f);\n" +" {\n" +" float4 v[64];\n" +" for (int i=0;i<nPoints;i++)\n" +" v[i] = p[i];\n" +" //memcpy( v, p, nPoints*sizeof(float4) );\n" +" PARALLEL_SUM( v, nPoints );\n" +" center = v[0]/(float)nPoints;\n" +" }\n" +" \n" +" { // sample 4 directions\n" +" if( nPoints < 4 )\n" +" {\n" +" for(int i=0; i<nPoints; i++) \n" +" contactIdx[i] = i;\n" +" return nPoints;\n" +" }\n" +" float4 aVector = p[0] - center;\n" +" float4 u = cross3( nearNormal, aVector );\n" +" float4 v = cross3( nearNormal, u );\n" +" u = normalize3( u );\n" +" v = normalize3( v );\n" +" int idx[4];\n" +" float2 max00 = make_float2(0,FLT_MAX);\n" +" {\n" +" // idx, distance\n" +" {\n" +" {\n" +" int4 a[64];\n" +" for(int ie = 0; ie<nPoints; ie++ )\n" +" {\n" +" \n" +" \n" +" float f;\n" +" float4 r = p[ie]-center;\n" +" f = dot3F4( u, r );\n" +" a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" +" f = dot3F4( -u, r );\n" +" a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" +" f = dot3F4( v, r );\n" +" a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" +" f = dot3F4( -v, r );\n" +" a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" +" }\n" +" for(int ie=0; ie<nPoints; ie++)\n" +" {\n" +" a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;\n" +" a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;\n" +" a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;\n" +" a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;\n" +" }\n" +" idx[0] = (int)a[0].x & 0xff;\n" +" idx[1] = (int)a[0].y & 0xff;\n" +" idx[2] = (int)a[0].z & 0xff;\n" +" idx[3] = (int)a[0].w & 0xff;\n" +" }\n" +" }\n" +" {\n" +" float2 h[64];\n" +" PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );\n" +" REDUCE_MIN( h, nPoints );\n" +" max00 = h[0];\n" +" }\n" +" }\n" +" contactIdx[0] = idx[0];\n" +" contactIdx[1] = idx[1];\n" +" contactIdx[2] = idx[2];\n" +" contactIdx[3] = idx[3];\n" +" return 4;\n" +" }\n" +"}\n" +"__kernel void extractManifoldAndAddContactKernel(__global const int4* pairs, \n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const float4* closestPointsWorld,\n" +" __global const float4* separatingNormalsWorld,\n" +" __global const int* contactCounts,\n" +" __global const int* contactOffsets,\n" +" __global struct b3Contact4Data* restrict contactsOut,\n" +" counter32_t nContactsOut,\n" +" int contactCapacity,\n" +" int numPairs,\n" +" int pairIndex\n" +" )\n" +"{\n" +" int idx = get_global_id(0);\n" +" \n" +" if (idx<numPairs)\n" +" {\n" +" float4 normal = separatingNormalsWorld[idx];\n" +" int nPoints = contactCounts[idx];\n" +" __global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n" +" float4 localPoints[64];\n" +" for (int i=0;i<nPoints;i++)\n" +" {\n" +" localPoints[i] = pointsIn[i];\n" +" }\n" +" int contactIdx[4];// = {-1,-1,-1,-1};\n" +" contactIdx[0] = -1;\n" +" contactIdx[1] = -1;\n" +" contactIdx[2] = -1;\n" +" contactIdx[3] = -1;\n" +" int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n" +" int dstIdx;\n" +" AppendInc( nContactsOut, dstIdx );\n" +" if (dstIdx<contactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = contactsOut + dstIdx;\n" +" c->m_worldNormalOnB = -normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = idx;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" for (int i=0;i<nContacts;i++)\n" +" {\n" +" c->m_worldPosB[i] = localPoints[contactIdx[i]];\n" +" }\n" +" GET_NPOINTS(*c) = nContacts;\n" +" }\n" +" }\n" +"}\n" +"void trInverse(float4 translationIn, Quaternion orientationIn,\n" +" float4* translationOut, Quaternion* orientationOut)\n" +"{\n" +" *orientationOut = qtInvert(orientationIn);\n" +" *translationOut = qtRotate(*orientationOut, -translationIn);\n" +"}\n" +"void trMul(float4 translationA, Quaternion orientationA,\n" +" float4 translationB, Quaternion orientationB,\n" +" float4* translationOut, Quaternion* orientationOut)\n" +"{\n" +" *orientationOut = qtMul(orientationA,orientationB);\n" +" *translationOut = transform(&translationB,&translationA,&orientationA);\n" +"}\n" +"__kernel void clipHullHullKernel( __global int4* pairs, \n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" __global const float4* separatingNormals,\n" +" __global const int* hasSeparatingAxis,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int numPairs,\n" +" int contactCapacity)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" float4 worldVertsB1[64];\n" +" float4 worldVertsB2[64];\n" +" int capacityWorldVerts = 64; \n" +" float4 localContactsOut[64];\n" +" int localContactCapacity=64;\n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" if (i<numPairs)\n" +" {\n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n" +" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" +" rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" +" rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" +" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" +" minDist, maxDist,\n" +" vertices,faces,indices,\n" +" localContactsOut,localContactCapacity);\n" +" \n" +" if (numLocalContactsOut>0)\n" +" {\n" +" float4 normal = -separatingNormals[i];\n" +" int nPoints = numLocalContactsOut;\n" +" float4* pointsIn = localContactsOut;\n" +" int contactIdx[4];// = {-1,-1,-1,-1};\n" +" contactIdx[0] = -1;\n" +" contactIdx[1] = -1;\n" +" contactIdx[2] = -1;\n" +" contactIdx[3] = -1;\n" +" \n" +" int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" +" \n" +" \n" +" int mprContactIndex = pairs[pairIndex].z;\n" +" int dstIdx = mprContactIndex;\n" +" if (dstIdx<0)\n" +" {\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" }\n" +" if (dstIdx<contactCapacity)\n" +" {\n" +" pairs[pairIndex].z = dstIdx;\n" +" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" +" c->m_worldNormalOnB = -normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" for (int i=0;i<nReducedContacts;i++)\n" +" {\n" +" //this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact\n" +" if (i>0||(mprContactIndex<0))\n" +" {\n" +" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" +" }\n" +" }\n" +" GET_NPOINTS(*c) = nReducedContacts;\n" +" }\n" +" \n" +" }// if (numContactsOut>0)\n" +" }// if (hasSeparatingAxis[i])\n" +" }// if (i<numPairs)\n" +"}\n" +"__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" __global const b3GpuChildShape_t* gpuChildShapes,\n" +" __global const float4* gpuCompoundSepNormalsOut,\n" +" __global const int* gpuHasCompoundSepNormalsOut,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int numCompoundPairs, int maxContactCapacity)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" float4 worldVertsB1[64];\n" +" float4 worldVertsB2[64];\n" +" int capacityWorldVerts = 64; \n" +" float4 localContactsOut[64];\n" +" int localContactCapacity=64;\n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" if (i<numCompoundPairs)\n" +" {\n" +" if (gpuHasCompoundSepNormalsOut[i])\n" +" {\n" +" int bodyIndexA = gpuCompoundPairs[i].x;\n" +" int bodyIndexB = gpuCompoundPairs[i].y;\n" +" \n" +" int childShapeIndexA = gpuCompoundPairs[i].z;\n" +" int childShapeIndexB = gpuCompoundPairs[i].w;\n" +" \n" +" int collidableIndexA = -1;\n" +" int collidableIndexB = -1;\n" +" \n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" \n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" \n" +" if (childShapeIndexA >= 0)\n" +" {\n" +" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" +" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" +" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" +" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" +" float4 newOrnA = qtMul(ornA,childOrnA);\n" +" posA = newPosA;\n" +" ornA = newOrnA;\n" +" } else\n" +" {\n" +" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" }\n" +" \n" +" if (childShapeIndexB>=0)\n" +" {\n" +" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" } else\n" +" {\n" +" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" +" }\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n" +" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" +" minDist, maxDist,\n" +" vertices,faces,indices,\n" +" localContactsOut,localContactCapacity);\n" +" \n" +" if (numLocalContactsOut>0)\n" +" {\n" +" float4 normal = -gpuCompoundSepNormalsOut[i];\n" +" int nPoints = numLocalContactsOut;\n" +" float4* pointsIn = localContactsOut;\n" +" int contactIdx[4];// = {-1,-1,-1,-1};\n" +" contactIdx[0] = -1;\n" +" contactIdx[1] = -1;\n" +" contactIdx[2] = -1;\n" +" contactIdx[3] = -1;\n" +" \n" +" int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" +" \n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" if ((dstIdx+nReducedContacts) < maxContactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" +" c->m_worldNormalOnB = -normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = gpuCompoundPairs[pairIndex].x;\n" +" int bodyB = gpuCompoundPairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_childIndexA = childShapeIndexA;\n" +" c->m_childIndexB = childShapeIndexB;\n" +" for (int i=0;i<nReducedContacts;i++)\n" +" {\n" +" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" +" }\n" +" GET_NPOINTS(*c) = nReducedContacts;\n" +" }\n" +" \n" +" }// if (numContactsOut>0)\n" +" }// if (gpuHasCompoundSepNormalsOut[i])\n" +" }// if (i<numCompoundPairs)\n" +"}\n" +"__kernel void sphereSphereCollisionKernel( __global const int4* pairs, \n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const float4* separatingNormals,\n" +" __global const int* hasSeparatingAxis,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int contactCapacity,\n" +" int numPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" +" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" //sphere-sphere\n" +" float radiusA = collidables[collidableIndexA].m_radius;\n" +" float radiusB = collidables[collidableIndexB].m_radius;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" float4 diff = posA-posB;\n" +" float len = length(diff);\n" +" \n" +" ///iff distance positive, don't generate a new contact\n" +" if ( len <= (radiusA+radiusB))\n" +" {\n" +" ///distance (negative means penetration)\n" +" float dist = len - (radiusA+radiusB);\n" +" float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" +" if (len > 0.00001)\n" +" {\n" +" normalOnSurfaceB = diff / len;\n" +" }\n" +" float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" +" contactPosB.w = dist;\n" +" \n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" if (dstIdx < contactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -normalOnSurfaceB;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_worldPosB[0] = contactPosB;\n" +" c->m_childIndexA = -1;\n" +" c->m_childIndexB = -1;\n" +" GET_NPOINTS(*c) = 1;\n" +" }//if (dstIdx < numPairs)\n" +" }//if ( len <= (radiusA+radiusB))\n" +" }//SHAPE_SPHERE SHAPE_SPHERE\n" +" }//if (i<numPairs)\n" +"} \n" +"__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n" +" __global const b3RigidBodyData_t* rigidBodies, \n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" __global const b3GpuChildShape_t* gpuChildShapes,\n" +" __global const float4* separatingNormals,\n" +" __global struct b3Contact4Data* restrict globalContactsOut,\n" +" counter32_t nGlobalContactsOut,\n" +" int contactCapacity,\n" +" int numConcavePairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" float4 worldVertsB1[64];\n" +" float4 worldVertsB2[64];\n" +" int capacityWorldVerts = 64; \n" +" float4 localContactsOut[64];\n" +" int localContactCapacity=64;\n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" if (i<numConcavePairs)\n" +" {\n" +" //negative value means that the pair is invalid\n" +" if (concavePairsIn[i].w<0)\n" +" return;\n" +" int bodyIndexA = concavePairsIn[i].x;\n" +" int bodyIndexB = concavePairsIn[i].y;\n" +" int f = concavePairsIn[i].z;\n" +" int childShapeIndexA = f;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" ///////////////////////////////////////////////////////////////\n" +" \n" +" \n" +" bool overlap = false;\n" +" \n" +" b3ConvexPolyhedronData_t convexPolyhedronA;\n" +" //add 3 vertices of the triangle\n" +" convexPolyhedronA.m_numVertices = 3;\n" +" convexPolyhedronA.m_vertexOffset = 0;\n" +" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" +" b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" +" \n" +" float4 verticesA[3];\n" +" for (int i=0;i<3;i++)\n" +" {\n" +" int index = indices[face.m_indexOffset+i];\n" +" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" +" verticesA[i] = vert;\n" +" localCenter += vert;\n" +" }\n" +" float dmin = FLT_MAX;\n" +" int localCC=0;\n" +" //a triangle has 3 unique edges\n" +" convexPolyhedronA.m_numUniqueEdges = 3;\n" +" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" +" float4 uniqueEdgesA[3];\n" +" \n" +" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" +" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" +" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" +" convexPolyhedronA.m_faceOffset = 0;\n" +" \n" +" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" +" \n" +" b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];\n" +" int indicesA[3+3+2+2+2];\n" +" int curUsedIndices=0;\n" +" int fidx=0;\n" +" //front size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[0] = 0;\n" +" indicesA[1] = 1;\n" +" indicesA[2] = 2;\n" +" curUsedIndices+=3;\n" +" float c = face.m_plane.w;\n" +" facesA[fidx].m_plane.x = normal.x;\n" +" facesA[fidx].m_plane.y = normal.y;\n" +" facesA[fidx].m_plane.z = normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" //back size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[3]=2;\n" +" indicesA[4]=1;\n" +" indicesA[5]=0;\n" +" curUsedIndices+=3;\n" +" float c = dot3F4(normal,verticesA[0]);\n" +" float c1 = -face.m_plane.w;\n" +" facesA[fidx].m_plane.x = -normal.x;\n" +" facesA[fidx].m_plane.y = -normal.y;\n" +" facesA[fidx].m_plane.z = -normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" bool addEdgePlanes = true;\n" +" if (addEdgePlanes)\n" +" {\n" +" int numVertices=3;\n" +" int prevVertex = numVertices-1;\n" +" for (int i=0;i<numVertices;i++)\n" +" {\n" +" float4 v0 = verticesA[i];\n" +" float4 v1 = verticesA[prevVertex];\n" +" \n" +" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" +" float c = -dot3F4(edgeNormal,v0);\n" +" facesA[fidx].m_numIndices = 2;\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[curUsedIndices++]=i;\n" +" indicesA[curUsedIndices++]=prevVertex;\n" +" \n" +" facesA[fidx].m_plane.x = edgeNormal.x;\n" +" facesA[fidx].m_plane.y = edgeNormal.y;\n" +" facesA[fidx].m_plane.z = edgeNormal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" fidx++;\n" +" prevVertex = i;\n" +" }\n" +" }\n" +" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" +" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" float4 sepAxis = separatingNormals[i];\n" +" \n" +" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" +" int childShapeIndexB =-1;\n" +" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" ///////////////////\n" +" ///compound shape support\n" +" \n" +" childShapeIndexB = concavePairsIn[pairIndex].w;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" \n" +" }\n" +" \n" +" ////////////////////////////////////////\n" +" \n" +" \n" +" \n" +" int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n" +" &convexPolyhedronA, &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" +" minDist, maxDist,\n" +" &verticesA,&facesA,&indicesA,\n" +" vertices,faces,indices,\n" +" localContactsOut,localContactCapacity);\n" +" \n" +" if (numLocalContactsOut>0)\n" +" {\n" +" float4 normal = -separatingNormals[i];\n" +" int nPoints = numLocalContactsOut;\n" +" float4* pointsIn = localContactsOut;\n" +" int contactIdx[4];// = {-1,-1,-1,-1};\n" +" contactIdx[0] = -1;\n" +" contactIdx[1] = -1;\n" +" contactIdx[2] = -1;\n" +" contactIdx[3] = -1;\n" +" \n" +" int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" +" \n" +" int dstIdx;\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" if (dstIdx<contactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" +" c->m_worldNormalOnB = -normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = concavePairsIn[pairIndex].x;\n" +" int bodyB = concavePairsIn[pairIndex].y;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_childIndexA = childShapeIndexA;\n" +" c->m_childIndexB = childShapeIndexB;\n" +" for (int i=0;i<nReducedContacts;i++)\n" +" {\n" +" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" +" }\n" +" GET_NPOINTS(*c) = nReducedContacts;\n" +" }\n" +" \n" +" }// if (numContactsOut>0)\n" +" }// if (i<numPairs)\n" +"}\n" +"int findClippingFaces(const float4 separatingNormal,\n" +" __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,\n" +" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" +" __global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" int capacityWorldVerts,\n" +" const float minDist, float maxDist,\n" +" __global const float4* vertices,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" __global int4* clippingFaces, int pairIndex)\n" +"{\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" \n" +" \n" +" int closestFaceB=-1;\n" +" float dmax = -FLT_MAX;\n" +" \n" +" {\n" +" for(int face=0;face<hullB->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n" +" faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 WorldNormal = qtRotate(ornB, Normal);\n" +" float d = dot3F4(WorldNormal,separatingNormal);\n" +" if (d > dmax)\n" +" {\n" +" dmax = d;\n" +" closestFaceB = face;\n" +" }\n" +" }\n" +" }\n" +" \n" +" {\n" +" const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" +" const int numVertices = polyB.m_numIndices;\n" +" for(int e0=0;e0<numVertices;e0++)\n" +" {\n" +" const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" +" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" +" }\n" +" }\n" +" \n" +" int closestFaceA=-1;\n" +" {\n" +" float dmin = FLT_MAX;\n" +" for(int face=0;face<hullA->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(\n" +" faces[hullA->m_faceOffset+face].m_plane.x,\n" +" faces[hullA->m_faceOffset+face].m_plane.y,\n" +" faces[hullA->m_faceOffset+face].m_plane.z,\n" +" 0.f);\n" +" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" +" \n" +" float d = dot3F4(faceANormalWS,separatingNormal);\n" +" if (d < dmin)\n" +" {\n" +" dmin = d;\n" +" closestFaceA = face;\n" +" worldNormalsA1[pairIndex] = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" \n" +" int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" +" for(int e0=0;e0<numVerticesA;e0++)\n" +" {\n" +" const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" +" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" +" }\n" +" \n" +" clippingFaces[pairIndex].x = closestFaceA;\n" +" clippingFaces[pairIndex].y = closestFaceB;\n" +" clippingFaces[pairIndex].z = numVerticesA;\n" +" clippingFaces[pairIndex].w = numWorldVertsB1;\n" +" \n" +" \n" +" return numContactsOut;\n" +"}\n" +"int clipFaces(__global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" __global float4* worldVertsB2, \n" +" int capacityWorldVertsB2,\n" +" const float minDist, float maxDist,\n" +" __global int4* clippingFaces,\n" +" int pairIndex)\n" +"{\n" +" int numContactsOut = 0;\n" +" \n" +" int closestFaceA = clippingFaces[pairIndex].x;\n" +" int closestFaceB = clippingFaces[pairIndex].y;\n" +" int numVertsInA = clippingFaces[pairIndex].z;\n" +" int numVertsInB = clippingFaces[pairIndex].w;\n" +" \n" +" int numVertsOut = 0;\n" +" \n" +" if (closestFaceA<0)\n" +" return numContactsOut;\n" +" \n" +" __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" +" __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" +" \n" +" \n" +" \n" +" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" +" \n" +" for(int e0=0;e0<numVertsInA;e0++)\n" +" {\n" +" const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" +" const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" +" const float4 WorldEdge0 = aw - bw;\n" +" float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" +" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" +" float4 worldA1 = aw;\n" +" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" +" float4 planeNormalWS = planeNormalWS1;\n" +" float planeEqWS=planeEqWS1;\n" +" numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" +" __global float4* tmp = pVtxOut;\n" +" pVtxOut = pVtxIn;\n" +" pVtxIn = tmp;\n" +" numVertsInB = numVertsOut;\n" +" numVertsOut = 0;\n" +" }\n" +" \n" +" //float4 planeNormalWS = worldNormalsA1[pairIndex];\n" +" //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" +" \n" +" /*for (int i=0;i<numVertsInB;i++)\n" +" {\n" +" pVtxOut[i] = pVtxIn[i];\n" +" }*/\n" +" \n" +" \n" +" \n" +" \n" +" //numVertsInB=0;\n" +" \n" +" float4 planeNormalWS = worldNormalsA1[pairIndex];\n" +" float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" +" for (int i=0;i<numVertsInB;i++)\n" +" {\n" +" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" +" if (depth <=minDist)\n" +" {\n" +" depth = minDist;\n" +" }\n" +" \n" +" if (depth <=maxDist)\n" +" {\n" +" float4 pointInWorld = pVtxIn[i];\n" +" pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" +" }\n" +" }\n" +" \n" +" clippingFaces[pairIndex].w =numContactsOut;\n" +" \n" +" \n" +" return numContactsOut;\n" +"}\n" +"__kernel void findClippingFacesKernel( __global const int4* pairs,\n" +" __global const b3RigidBodyData_t* rigidBodies,\n" +" __global const b3Collidable_t* collidables,\n" +" __global const b3ConvexPolyhedronData_t* convexShapes,\n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const b3GpuFace_t* faces,\n" +" __global const int* indices,\n" +" __global const float4* separatingNormals,\n" +" __global const int* hasSeparatingAxis,\n" +" __global int4* clippingFacesOut,\n" +" __global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" int capacityWorldVerts,\n" +" int numPairs\n" +" )\n" +"{\n" +" \n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" \n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" \n" +" int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n" +" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" +" rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" +" rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" +" worldVertsA1,\n" +" worldNormalsA1,\n" +" worldVertsB1,capacityWorldVerts,\n" +" minDist, maxDist,\n" +" vertices,faces,indices,\n" +" clippingFacesOut,i);\n" +" \n" +" \n" +" }// if (hasSeparatingAxis[i])\n" +" }// if (i<numPairs)\n" +" \n" +"}\n" +"__kernel void clipFacesAndFindContactsKernel( __global const float4* separatingNormals,\n" +" __global const int* hasSeparatingAxis,\n" +" __global int4* clippingFacesOut,\n" +" __global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" __global float4* worldVertsB2,\n" +" int vertexFaceCapacity,\n" +" int numPairs,\n" +" int debugMode\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" \n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +"// int bodyIndexA = pairs[i].x;\n" +" // int bodyIndexB = pairs[i].y;\n" +" \n" +" int numLocalContactsOut = 0;\n" +" int capacityWorldVertsB2 = vertexFaceCapacity;\n" +" \n" +" __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" +" __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" +" \n" +" {\n" +" __global int4* clippingFaces = clippingFacesOut;\n" +" \n" +" \n" +" int closestFaceA = clippingFaces[pairIndex].x;\n" +" int closestFaceB = clippingFaces[pairIndex].y;\n" +" int numVertsInA = clippingFaces[pairIndex].z;\n" +" int numVertsInB = clippingFaces[pairIndex].w;\n" +" \n" +" int numVertsOut = 0;\n" +" \n" +" if (closestFaceA>=0)\n" +" {\n" +" \n" +" \n" +" \n" +" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" +" \n" +" for(int e0=0;e0<numVertsInA;e0++)\n" +" {\n" +" const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" +" const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" +" const float4 WorldEdge0 = aw - bw;\n" +" float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" +" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" +" float4 worldA1 = aw;\n" +" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" +" float4 planeNormalWS = planeNormalWS1;\n" +" float planeEqWS=planeEqWS1;\n" +" numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" +" __global float4* tmp = pVtxOut;\n" +" pVtxOut = pVtxIn;\n" +" pVtxIn = tmp;\n" +" numVertsInB = numVertsOut;\n" +" numVertsOut = 0;\n" +" }\n" +" \n" +" float4 planeNormalWS = worldNormalsA1[pairIndex];\n" +" float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" +" \n" +" for (int i=0;i<numVertsInB;i++)\n" +" {\n" +" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" +" if (depth <=minDist)\n" +" {\n" +" depth = minDist;\n" +" }\n" +" \n" +" if (depth <=maxDist)\n" +" {\n" +" float4 pointInWorld = pVtxIn[i];\n" +" pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" +" }\n" +" }\n" +" \n" +" }\n" +" clippingFaces[pairIndex].w =numLocalContactsOut;\n" +" \n" +" }\n" +" \n" +" for (int i=0;i<numLocalContactsOut;i++)\n" +" pVtxIn[i] = pVtxOut[i];\n" +" \n" +" }// if (hasSeparatingAxis[i])\n" +" }// if (i<numPairs)\n" +" \n" +"}\n" +"__kernel void newContactReductionKernel( __global int4* pairs,\n" +" __global const b3RigidBodyData_t* rigidBodies,\n" +" __global const float4* separatingNormals,\n" +" __global const int* hasSeparatingAxis,\n" +" __global struct b3Contact4Data* globalContactsOut,\n" +" __global int4* clippingFaces,\n" +" __global float4* worldVertsB2,\n" +" volatile __global int* nGlobalContactsOut,\n" +" int vertexFaceCapacity,\n" +" int contactCapacity,\n" +" int numPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" int pairIndex = i;\n" +" \n" +" int4 contactIdx;\n" +" contactIdx=make_int4(0,1,2,3);\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" \n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +" \n" +" \n" +" \n" +" int nPoints = clippingFaces[pairIndex].w;\n" +" \n" +" if (nPoints>0)\n" +" {\n" +" __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n" +" float4 normal = -separatingNormals[i];\n" +" \n" +" int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n" +" \n" +" int mprContactIndex = pairs[pairIndex].z;\n" +" int dstIdx = mprContactIndex;\n" +" if (dstIdx<0)\n" +" {\n" +" AppendInc( nGlobalContactsOut, dstIdx );\n" +" }\n" +"//#if 0\n" +" \n" +" if (dstIdx < contactCapacity)\n" +" {\n" +" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" +" c->m_worldNormalOnB = -normal;\n" +" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" +" c->m_batchIdx = pairIndex;\n" +" int bodyA = pairs[pairIndex].x;\n" +" int bodyB = pairs[pairIndex].y;\n" +" pairs[pairIndex].w = dstIdx;\n" +" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" +" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" +" c->m_childIndexA =-1;\n" +" c->m_childIndexB =-1;\n" +" switch (nReducedContacts)\n" +" {\n" +" case 4:\n" +" c->m_worldPosB[3] = pointsIn[contactIdx.w];\n" +" case 3:\n" +" c->m_worldPosB[2] = pointsIn[contactIdx.z];\n" +" case 2:\n" +" c->m_worldPosB[1] = pointsIn[contactIdx.y];\n" +" case 1:\n" +" if (mprContactIndex<0)//test\n" +" c->m_worldPosB[0] = pointsIn[contactIdx.x];\n" +" default:\n" +" {\n" +" }\n" +" };\n" +" \n" +" GET_NPOINTS(*c) = nReducedContacts;\n" +" \n" +" }\n" +" \n" +" \n" +"//#endif\n" +" \n" +" }// if (numContactsOut>0)\n" +" }// if (hasSeparatingAxis[i])\n" +" }// if (i<numPairs)\n" +" \n" +" \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl new file mode 100644 index 0000000000..31ca43b8cd --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl @@ -0,0 +1,1220 @@ + +//keep this enum in sync with the CPU version (in btCollidable.h) +//written by Erwin Coumans + + +#define SHAPE_CONVEX_HULL 3 +#define SHAPE_CONCAVE_TRIMESH 5 +#define TRIANGLE_NUM_CONVEX_FACES 5 +#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 + +#define B3_MAX_STACK_DEPTH 256 + + +typedef unsigned int u32; + +///keep this in sync with btCollidable.h +typedef struct +{ + union { + int m_numChildShapes; + int m_bvhIndex; + }; + union + { + float m_radius; + int m_compoundBvhIndex; + }; + + int m_shapeType; + int m_shapeIndex; + +} btCollidableGpu; + +#define MAX_NUM_PARTS_IN_BITS 10 + +///b3QuantizedBvhNode is a compressed aabb node, 16 bytes. +///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range). +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes + int m_escapeIndexOrTriangleIndex; +} b3QuantizedBvhNode; + +typedef struct +{ + float4 m_aabbMin; + float4 m_aabbMax; + float4 m_quantization; + int m_numNodes; + int m_numSubTrees; + int m_nodeOffset; + int m_subTreeOffset; + +} b3BvhInfo; + + +int getTriangleIndex(const b3QuantizedBvhNode* rootNode) +{ + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); +} + +int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + unsigned int x=0; + unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + // Get only the lower bits where the triangle index is stored + return (rootNode->m_escapeIndexOrTriangleIndex&~(y)); +} + +int isLeafNode(const b3QuantizedBvhNode* rootNode) +{ + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0; +} + +int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + //skipindex is negative (internal node), triangleindex >=0 (leafnode) + return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0; +} + +int getEscapeIndex(const b3QuantizedBvhNode* rootNode) +{ + return -rootNode->m_escapeIndexOrTriangleIndex; +} + +int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode) +{ + return -rootNode->m_escapeIndexOrTriangleIndex; +} + + +typedef struct +{ + //12 bytes + unsigned short int m_quantizedAabbMin[3]; + unsigned short int m_quantizedAabbMax[3]; + //4 bytes, points to the root of the subtree + int m_rootNodeIndex; + //4 bytes + int m_subtreeSize; + int m_padding[3]; +} b3BvhSubtreeInfo; + + + + + + + +typedef struct +{ + float4 m_childPosition; + float4 m_childOrientation; + int m_shapeIndex; + int m_unused0; + int m_unused1; + int m_unused2; +} btGpuChildShape; + + +typedef struct +{ + float4 m_pos; + float4 m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_collidableIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} BodyData; + + +typedef struct +{ + float4 m_localCenter; + float4 m_extents; + float4 mC; + float4 mE; + + float m_radius; + int m_faceOffset; + int m_numFaces; + int m_numVertices; + + int m_vertexOffset; + int m_uniqueEdgesOffset; + int m_numUniqueEdges; + int m_unused; +} ConvexPolyhedronCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + +#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3Common/shared/b3Int2.h" + + + +typedef struct +{ + float4 m_plane; + int m_indexOffset; + int m_numIndices; +} btGpuFace; + +#define make_float4 (float4) + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); + + +// float4 a1 = make_float4(a.xyz,0.f); +// float4 b1 = make_float4(b.xyz,0.f); + +// return cross(a1,b1); + +//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f); + + // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f); + + //return c; +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float4 fastNormalize4(float4 v) +{ + v = make_float4(v.xyz,0.f); + return fast_normalize(v); +} + + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + +__inline +float4 transform(const float4* p, const float4* translation, const Quaternion* orientation) +{ + return qtRotate( *orientation, *p ) + (*translation); +} + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +} + +inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, +const float4* dir, const float4* vertices, float* min, float* max) +{ + min[0] = FLT_MAX; + max[0] = -FLT_MAX; + int numVerts = hull->m_numVertices; + + const float4 localDir = qtInvRotate(orn,*dir); + float offset = dot(pos,*dir); + for(int i=0;i<numVerts;i++) + { + float dp = dot(vertices[hull->m_vertexOffset+i],localDir); + if(dp < min[0]) + min[0] = dp; + if(dp > max[0]) + max[0] = dp; + } + if(min[0]>max[0]) + { + float tmp = min[0]; + min[0] = max[0]; + max[0] = tmp; + } + min[0] += offset; + max[0] += offset; +} + +inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, +const float4* dir, __global const float4* vertices, float* min, float* max) +{ + min[0] = FLT_MAX; + max[0] = -FLT_MAX; + int numVerts = hull->m_numVertices; + + const float4 localDir = qtInvRotate(orn,*dir); + float offset = dot(pos,*dir); + for(int i=0;i<numVerts;i++) + { + float dp = dot(vertices[hull->m_vertexOffset+i],localDir); + if(dp < min[0]) + min[0] = dp; + if(dp > max[0]) + max[0] = dp; + } + if(min[0]>max[0]) + { + float tmp = min[0]; + min[0] = max[0]; + max[0] = tmp; + } + min[0] += offset; + max[0] += offset; +} + +inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA,const float4 ornA, + const float4 posB,const float4 ornB, + float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth) +{ + float Min0,Max0; + float Min1,Max1; + projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0); + project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + return false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + *depth = d0<d1 ? d0:d1; + return true; +} + + + + +inline bool IsAlmostZero(const float4 v) +{ + if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f) + return false; + return true; +} + + + +bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + + const float4* verticesA, + const float4* uniqueEdgesA, + const btGpuFace* facesA, + const int* indicesA, + + __global const float4* verticesB, + __global const float4* uniqueEdgesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + int curPlaneTests=0; + { + int numFacesA = hullA->m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4 normal = facesA[hullA->m_faceOffset+i].m_plane; + float4 faceANormalWS = qtRotate(ornA,normal); + if (dot3F4(DeltaC2,faceANormalWS)<0) + faceANormalWS*=-1.f; + curPlaneTests++; + float d; + if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d)) + return false; + if(d<*dmin) + { + *dmin = d; + *sep = faceANormalWS; + } + } + } + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + +bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + __global const float4* verticesA, + __global const float4* uniqueEdgesA, + __global const btGpuFace* facesA, + __global const int* indicesA, + const float4* verticesB, + const float4* uniqueEdgesB, + const btGpuFace* facesB, + const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + int curPlaneTests=0; + { + int numFacesA = hullA->m_numFaces; + // Test normals from hullA + for(int i=0;i<numFacesA;i++) + { + const float4 normal = facesA[hullA->m_faceOffset+i].m_plane; + float4 faceANormalWS = qtRotate(ornA,normal); + if (dot3F4(DeltaC2,faceANormalWS)<0) + faceANormalWS *= -1.f; + curPlaneTests++; + float d; + if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d)) + return false; + if(d<*dmin) + { + *dmin = d; + *sep = faceANormalWS; + } + } + } + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + + +bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, + const float4 posA1, + const float4 ornA, + const float4 posB1, + const float4 ornB, + const float4 DeltaC2, + const float4* verticesA, + const float4* uniqueEdgesA, + const btGpuFace* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const float4* uniqueEdgesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + float4* sep, + float* dmin) +{ + + + float4 posA = posA1; + posA.w = 0.f; + float4 posB = posB1; + posB.w = 0.f; + + int curPlaneTests=0; + + int curEdgeEdge = 0; + // Test edges + for(int e0=0;e0<hullA->m_numUniqueEdges;e0++) + { + const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0]; + float4 edge0World = qtRotate(ornA,edge0); + + for(int e1=0;e1<hullB->m_numUniqueEdges;e1++) + { + const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1]; + float4 edge1World = qtRotate(ornB,edge1); + + + float4 crossje = cross3(edge0World,edge1World); + + curEdgeEdge++; + if(!IsAlmostZero(crossje)) + { + crossje = normalize3(crossje); + if (dot3F4(DeltaC2,crossje)<0) + crossje *= -1.f; + + float dist; + bool result = true; + { + float Min0,Max0; + float Min1,Max1; + projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0); + project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1); + + if(Max0<Min1 || Max1<Min0) + result = false; + + float d0 = Max0 - Min1; + float d1 = Max1 - Min0; + dist = d0<d1 ? d0:d1; + result = true; + + } + + + if(dist<*dmin) + { + *dmin = dist; + *sep = crossje; + } + } + } + + } + + + if((dot3F4(-DeltaC2,*sep))>0.0f) + { + *sep = -(*sep); + } + return true; +} + + + +inline int findClippingFaces(const float4 separatingNormal, + const ConvexPolyhedronCL* hullA, + __global const ConvexPolyhedronCL* hullB, + const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, + __global float4* worldVertsA1, + __global float4* worldNormalsA1, + __global float4* worldVertsB1, + int capacityWorldVerts, + const float minDist, float maxDist, + const float4* verticesA, + const btGpuFace* facesA, + const int* indicesA, + __global const float4* verticesB, + __global const btGpuFace* facesB, + __global const int* indicesB, + __global int4* clippingFaces, int pairIndex) +{ + int numContactsOut = 0; + int numWorldVertsB1= 0; + + + int closestFaceB=0; + float dmax = -FLT_MAX; + + { + for(int face=0;face<hullB->m_numFaces;face++) + { + const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, + facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f); + const float4 WorldNormal = qtRotate(ornB, Normal); + float d = dot3F4(WorldNormal,separatingNormal); + if (d > dmax) + { + dmax = d; + closestFaceB = face; + } + } + } + + { + const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB]; + int numVertices = polyB.m_numIndices; + if (numVertices>capacityWorldVerts) + numVertices = capacityWorldVerts; + if (numVertices<0) + numVertices = 0; + + for(int e0=0;e0<numVertices;e0++) + { + if (e0<capacityWorldVerts) + { + const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]]; + worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB); + } + } + } + + int closestFaceA=0; + { + float dmin = FLT_MAX; + for(int face=0;face<hullA->m_numFaces;face++) + { + const float4 Normal = make_float4( + facesA[hullA->m_faceOffset+face].m_plane.x, + facesA[hullA->m_faceOffset+face].m_plane.y, + facesA[hullA->m_faceOffset+face].m_plane.z, + 0.f); + const float4 faceANormalWS = qtRotate(ornA,Normal); + + float d = dot3F4(faceANormalWS,separatingNormal); + if (d < dmin) + { + dmin = d; + closestFaceA = face; + worldNormalsA1[pairIndex] = faceANormalWS; + } + } + } + + int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices; + if (numVerticesA>capacityWorldVerts) + numVerticesA = capacityWorldVerts; + if (numVerticesA<0) + numVerticesA=0; + + for(int e0=0;e0<numVerticesA;e0++) + { + if (e0<capacityWorldVerts) + { + const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]]; + worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA); + } + } + + clippingFaces[pairIndex].x = closestFaceA; + clippingFaces[pairIndex].y = closestFaceB; + clippingFaces[pairIndex].z = numVerticesA; + clippingFaces[pairIndex].w = numWorldVertsB1; + + + return numContactsOut; +} + + + + +// work-in-progress +__kernel void findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global const btGpuChildShape* gpuChildShapes, + __global btAabbCL* aabbs, + __global float4* concaveSeparatingNormalsOut, + __global int* concaveHasSeparatingNormals, + __global int4* clippingFacesOut, + __global float4* worldVertsA1GPU, + __global float4* worldNormalsAGPU, + __global float4* worldVertsB1GPU, + __global float* dmins, + int vertexFaceCapacity, + int numConcavePairs + ) +{ + + int i = get_global_id(0); + if (i>=numConcavePairs) + return; + + concaveHasSeparatingNormals[i] = 0; + + int pairIdx = i; + + int bodyIndexA = concavePairs[i].x; + int bodyIndexB = concavePairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&& + collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + concavePairs[pairIdx].w = -1; + return; + } + + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + int numActualConcaveConvexTests = 0; + + int f = concavePairs[i].z; + + bool overlap = false; + + ConvexPolyhedronCL convexPolyhedronA; + + //add 3 vertices of the triangle + convexPolyhedronA.m_numVertices = 3; + convexPolyhedronA.m_vertexOffset = 0; + float4 localCenter = make_float4(0.f,0.f,0.f,0.f); + + btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f]; + float4 triMinAabb, triMaxAabb; + btAabbCL triAabb; + triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f); + triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f); + + float4 verticesA[3]; + for (int i=0;i<3;i++) + { + int index = indices[face.m_indexOffset+i]; + float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index]; + verticesA[i] = vert; + localCenter += vert; + + triAabb.m_min = min(triAabb.m_min,vert); + triAabb.m_max = max(triAabb.m_max,vert); + + } + + overlap = true; + overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap; + overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap; + overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap; + + if (overlap) + { + float dmin = FLT_MAX; + int hasSeparatingAxis=5; + float4 sepAxis=make_float4(1,2,3,4); + + int localCC=0; + numActualConcaveConvexTests++; + + //a triangle has 3 unique edges + convexPolyhedronA.m_numUniqueEdges = 3; + convexPolyhedronA.m_uniqueEdgesOffset = 0; + float4 uniqueEdgesA[3]; + + uniqueEdgesA[0] = (verticesA[1]-verticesA[0]); + uniqueEdgesA[1] = (verticesA[2]-verticesA[1]); + uniqueEdgesA[2] = (verticesA[0]-verticesA[2]); + + + convexPolyhedronA.m_faceOffset = 0; + + float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); + + btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES]; + int indicesA[3+3+2+2+2]; + int curUsedIndices=0; + int fidx=0; + + //front size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[0] = 0; + indicesA[1] = 1; + indicesA[2] = 2; + curUsedIndices+=3; + float c = face.m_plane.w; + facesA[fidx].m_plane.x = normal.x; + facesA[fidx].m_plane.y = normal.y; + facesA[fidx].m_plane.z = normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + //back size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[3]=2; + indicesA[4]=1; + indicesA[5]=0; + curUsedIndices+=3; + float c = dot(normal,verticesA[0]); + float c1 = -face.m_plane.w; + facesA[fidx].m_plane.x = -normal.x; + facesA[fidx].m_plane.y = -normal.y; + facesA[fidx].m_plane.z = -normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + + bool addEdgePlanes = true; + if (addEdgePlanes) + { + int numVertices=3; + int prevVertex = numVertices-1; + for (int i=0;i<numVertices;i++) + { + float4 v0 = verticesA[i]; + float4 v1 = verticesA[prevVertex]; + + float4 edgeNormal = normalize(cross(normal,v1-v0)); + float c = -dot(edgeNormal,v0); + + facesA[fidx].m_numIndices = 2; + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[curUsedIndices++]=i; + indicesA[curUsedIndices++]=prevVertex; + + facesA[fidx].m_plane.x = edgeNormal.x; + facesA[fidx].m_plane.y = edgeNormal.y; + facesA[fidx].m_plane.z = edgeNormal.z; + facesA[fidx].m_plane.w = c; + fidx++; + prevVertex = i; + } + } + convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; + convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); + + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + + + + + /////////////////// + ///compound shape support + + if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int compoundChild = concavePairs[pairIdx].w; + int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + shapeIndexB = collidables[childColIndexB].m_shapeIndex; + } + ////////////////// + + float4 c0local = convexPolyhedronA.m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + + + bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + DeltaC2, + verticesA,uniqueEdgesA,facesA,indicesA, + vertices,uniqueEdges,faces,indices, + &sepAxis,&dmin); + hasSeparatingAxis = 4; + if (!sepA) + { + hasSeparatingAxis = 0; + } else + { + bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA, + posB,ornB, + posA,ornA, + DeltaC2, + vertices,uniqueEdges,faces,indices, + verticesA,uniqueEdgesA,facesA,indicesA, + &sepAxis,&dmin); + + if (!sepB) + { + hasSeparatingAxis = 0; + } else + { + hasSeparatingAxis = 1; + } + } + + if (hasSeparatingAxis) + { + dmins[i] = dmin; + concaveSeparatingNormalsOut[pairIdx]=sepAxis; + concaveHasSeparatingNormals[i]=1; + + } else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } + } + else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } +} + + + + +// work-in-progress +__kernel void findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs, + __global const BodyData* rigidBodies, + __global const btCollidableGpu* collidables, + __global const ConvexPolyhedronCL* convexShapes, + __global const float4* vertices, + __global const float4* uniqueEdges, + __global const btGpuFace* faces, + __global const int* indices, + __global const btGpuChildShape* gpuChildShapes, + __global btAabbCL* aabbs, + __global float4* concaveSeparatingNormalsOut, + __global int* concaveHasSeparatingNormals, + __global int4* clippingFacesOut, + __global float4* worldVertsA1GPU, + __global float4* worldNormalsAGPU, + __global float4* worldVertsB1GPU, + __global float* dmins, + int vertexFaceCapacity, + int numConcavePairs + ) +{ + + int i = get_global_id(0); + if (i>=numConcavePairs) + return; + + if (!concaveHasSeparatingNormals[i]) + return; + + int pairIdx = i; + + int bodyIndexA = concavePairs[i].x; + int bodyIndexB = concavePairs[i].y; + + int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; + int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; + + + int numFacesA = convexShapes[shapeIndexA].m_numFaces; + int numActualConcaveConvexTests = 0; + + int f = concavePairs[i].z; + + bool overlap = false; + + ConvexPolyhedronCL convexPolyhedronA; + + //add 3 vertices of the triangle + convexPolyhedronA.m_numVertices = 3; + convexPolyhedronA.m_vertexOffset = 0; + float4 localCenter = make_float4(0.f,0.f,0.f,0.f); + + btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f]; + float4 triMinAabb, triMaxAabb; + btAabbCL triAabb; + triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f); + triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f); + + float4 verticesA[3]; + for (int i=0;i<3;i++) + { + int index = indices[face.m_indexOffset+i]; + float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index]; + verticesA[i] = vert; + localCenter += vert; + + triAabb.m_min = min(triAabb.m_min,vert); + triAabb.m_max = max(triAabb.m_max,vert); + + } + + overlap = true; + overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap; + overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap; + overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap; + + if (overlap) + { + float dmin = dmins[i]; + int hasSeparatingAxis=5; + float4 sepAxis=make_float4(1,2,3,4); + sepAxis = concaveSeparatingNormalsOut[pairIdx]; + + int localCC=0; + numActualConcaveConvexTests++; + + //a triangle has 3 unique edges + convexPolyhedronA.m_numUniqueEdges = 3; + convexPolyhedronA.m_uniqueEdgesOffset = 0; + float4 uniqueEdgesA[3]; + + uniqueEdgesA[0] = (verticesA[1]-verticesA[0]); + uniqueEdgesA[1] = (verticesA[2]-verticesA[1]); + uniqueEdgesA[2] = (verticesA[0]-verticesA[2]); + + + convexPolyhedronA.m_faceOffset = 0; + + float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); + + btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES]; + int indicesA[3+3+2+2+2]; + int curUsedIndices=0; + int fidx=0; + + //front size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[0] = 0; + indicesA[1] = 1; + indicesA[2] = 2; + curUsedIndices+=3; + float c = face.m_plane.w; + facesA[fidx].m_plane.x = normal.x; + facesA[fidx].m_plane.y = normal.y; + facesA[fidx].m_plane.z = normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + //back size of triangle + { + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[3]=2; + indicesA[4]=1; + indicesA[5]=0; + curUsedIndices+=3; + float c = dot(normal,verticesA[0]); + float c1 = -face.m_plane.w; + facesA[fidx].m_plane.x = -normal.x; + facesA[fidx].m_plane.y = -normal.y; + facesA[fidx].m_plane.z = -normal.z; + facesA[fidx].m_plane.w = c; + facesA[fidx].m_numIndices=3; + } + fidx++; + + bool addEdgePlanes = true; + if (addEdgePlanes) + { + int numVertices=3; + int prevVertex = numVertices-1; + for (int i=0;i<numVertices;i++) + { + float4 v0 = verticesA[i]; + float4 v1 = verticesA[prevVertex]; + + float4 edgeNormal = normalize(cross(normal,v1-v0)); + float c = -dot(edgeNormal,v0); + + facesA[fidx].m_numIndices = 2; + facesA[fidx].m_indexOffset=curUsedIndices; + indicesA[curUsedIndices++]=i; + indicesA[curUsedIndices++]=prevVertex; + + facesA[fidx].m_plane.x = edgeNormal.x; + facesA[fidx].m_plane.y = edgeNormal.y; + facesA[fidx].m_plane.z = edgeNormal.z; + facesA[fidx].m_plane.w = c; + fidx++; + prevVertex = i; + } + } + convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES; + convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f); + + + float4 posA = rigidBodies[bodyIndexA].m_pos; + posA.w = 0.f; + float4 posB = rigidBodies[bodyIndexB].m_pos; + posB.w = 0.f; + + float4 ornA = rigidBodies[bodyIndexA].m_quat; + float4 ornB =rigidBodies[bodyIndexB].m_quat; + + + + + /////////////////// + ///compound shape support + + if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + { + int compoundChild = concavePairs[pairIdx].w; + int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild; + int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; + float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; + float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = transform(&childPosB,&posB,&ornB); + float4 newOrnB = qtMul(ornB,childOrnB); + posB = newPosB; + ornB = newOrnB; + shapeIndexB = collidables[childColIndexB].m_shapeIndex; + } + ////////////////// + + float4 c0local = convexPolyhedronA.m_localCenter; + float4 c0 = transform(&c0local, &posA, &ornA); + float4 c1local = convexShapes[shapeIndexB].m_localCenter; + float4 c1 = transform(&c1local,&posB,&ornB); + const float4 DeltaC2 = c0 - c1; + + + { + bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + DeltaC2, + verticesA,uniqueEdgesA,facesA,indicesA, + vertices,uniqueEdges,faces,indices, + &sepAxis,&dmin); + + if (!sepEE) + { + hasSeparatingAxis = 0; + } else + { + hasSeparatingAxis = 1; + } + } + + + if (hasSeparatingAxis) + { + sepAxis.w = dmin; + dmins[i] = dmin; + concaveSeparatingNormalsOut[pairIdx]=sepAxis; + concaveHasSeparatingNormals[i]=1; + + float minDist = -1e30f; + float maxDist = 0.02f; + + + findClippingFaces(sepAxis, + &convexPolyhedronA, + &convexShapes[shapeIndexB], + posA,ornA, + posB,ornB, + worldVertsA1GPU, + worldNormalsAGPU, + worldVertsB1GPU, + vertexFaceCapacity, + minDist, maxDist, + verticesA, + facesA, + indicesA, + vertices, + faces, + indices, + clippingFacesOut, pairIdx); + + + } else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } + } + else + { + //mark this pair as in-active + concavePairs[pairIdx].w = -1; + } + + concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h new file mode 100644 index 0000000000..611569cacf --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h @@ -0,0 +1,1457 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* satConcaveKernelsCL= \ +"//keep this enum in sync with the CPU version (in btCollidable.h)\n" +"//written by Erwin Coumans\n" +"#define SHAPE_CONVEX_HULL 3\n" +"#define SHAPE_CONCAVE_TRIMESH 5\n" +"#define TRIANGLE_NUM_CONVEX_FACES 5\n" +"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" +"#define B3_MAX_STACK_DEPTH 256\n" +"typedef unsigned int u32;\n" +"///keep this in sync with btCollidable.h\n" +"typedef struct\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" \n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +" \n" +"} btCollidableGpu;\n" +"#define MAX_NUM_PARTS_IN_BITS 10\n" +"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" +"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes\n" +" int m_escapeIndexOrTriangleIndex;\n" +"} b3QuantizedBvhNode;\n" +"typedef struct\n" +"{\n" +" float4 m_aabbMin;\n" +" float4 m_aabbMax;\n" +" float4 m_quantization;\n" +" int m_numNodes;\n" +" int m_numSubTrees;\n" +" int m_nodeOffset;\n" +" int m_subTreeOffset;\n" +"} b3BvhInfo;\n" +"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" unsigned int x=0;\n" +" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" +" // Get only the lower bits where the triangle index is stored\n" +" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" +"}\n" +"int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" unsigned int x=0;\n" +" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" +" // Get only the lower bits where the triangle index is stored\n" +" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" +"}\n" +"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" +" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" +"}\n" +"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" +" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" +"}\n" +" \n" +"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" return -rootNode->m_escapeIndexOrTriangleIndex;\n" +"}\n" +"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" return -rootNode->m_escapeIndexOrTriangleIndex;\n" +"}\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes, points to the root of the subtree\n" +" int m_rootNodeIndex;\n" +" //4 bytes\n" +" int m_subtreeSize;\n" +" int m_padding[3];\n" +"} b3BvhSubtreeInfo;\n" +"typedef struct\n" +"{\n" +" float4 m_childPosition;\n" +" float4 m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"} btGpuChildShape;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" float4 m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} BodyData;\n" +"typedef struct \n" +"{\n" +" float4 m_localCenter;\n" +" float4 m_extents;\n" +" float4 mC;\n" +" float4 mE;\n" +" \n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"} ConvexPolyhedronCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"#ifndef B3_AABB_H\n" +"#define B3_AABB_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3Aabb b3Aabb_t;\n" +"struct b3Aabb\n" +"{\n" +" union\n" +" {\n" +" float m_min[4];\n" +" b3Float4 m_minVec;\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float m_max[4];\n" +" b3Float4 m_maxVec;\n" +" int m_signedMaxIndices[4];\n" +" };\n" +"};\n" +"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" +" b3Float4ConstArg pos,\n" +" b3QuatConstArg orn,\n" +" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" +"{\n" +" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" +" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" +" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" +" b3Mat3x3 m;\n" +" m = b3QuatGetRotationMatrix(orn);\n" +" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" +" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" +" \n" +" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" +" 0.f);\n" +" *aabbMinOut = center-extent;\n" +" *aabbMaxOut = center+extent;\n" +"}\n" +"/// conservative test for overlap between two aabbs\n" +"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" +" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" +" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" +" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"#endif //B3_AABB_H\n" +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"#ifndef B3_INT2_H\n" +"#define B3_INT2_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#define b3UnsignedInt2 uint2\n" +"#define b3Int2 int2\n" +"#define b3MakeInt2 (int2)\n" +"#endif //__cplusplus\n" +"#endif\n" +"typedef struct\n" +"{\n" +" float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +"} btGpuFace;\n" +"#define make_float4 (float4)\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +" \n" +"// float4 a1 = make_float4(a.xyz,0.f);\n" +"// float4 b1 = make_float4(b.xyz,0.f);\n" +"// return cross(a1,b1);\n" +"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" +" \n" +" // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" +" \n" +" //return c;\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" v = make_float4(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"__inline\n" +"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" +"{\n" +" return qtRotate( *orientation, *p ) + (*translation);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"}\n" +"inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" +"const float4* dir, const float4* vertices, float* min, float* max)\n" +"{\n" +" min[0] = FLT_MAX;\n" +" max[0] = -FLT_MAX;\n" +" int numVerts = hull->m_numVertices;\n" +" const float4 localDir = qtInvRotate(orn,*dir);\n" +" float offset = dot(pos,*dir);\n" +" for(int i=0;i<numVerts;i++)\n" +" {\n" +" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" +" if(dp < min[0]) \n" +" min[0] = dp;\n" +" if(dp > max[0]) \n" +" max[0] = dp;\n" +" }\n" +" if(min[0]>max[0])\n" +" {\n" +" float tmp = min[0];\n" +" min[0] = max[0];\n" +" max[0] = tmp;\n" +" }\n" +" min[0] += offset;\n" +" max[0] += offset;\n" +"}\n" +"inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" +"const float4* dir, __global const float4* vertices, float* min, float* max)\n" +"{\n" +" min[0] = FLT_MAX;\n" +" max[0] = -FLT_MAX;\n" +" int numVerts = hull->m_numVertices;\n" +" const float4 localDir = qtInvRotate(orn,*dir);\n" +" float offset = dot(pos,*dir);\n" +" for(int i=0;i<numVerts;i++)\n" +" {\n" +" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" +" if(dp < min[0]) \n" +" min[0] = dp;\n" +" if(dp > max[0]) \n" +" max[0] = dp;\n" +" }\n" +" if(min[0]>max[0])\n" +" {\n" +" float tmp = min[0];\n" +" min[0] = max[0];\n" +" max[0] = tmp;\n" +" }\n" +" min[0] += offset;\n" +" max[0] += offset;\n" +"}\n" +"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA,const float4 ornA,\n" +" const float4 posB,const float4 ornB,\n" +" float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" +"{\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" +" project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" *depth = d0<d1 ? d0:d1;\n" +" return true;\n" +"}\n" +"inline bool IsAlmostZero(const float4 v)\n" +"{\n" +" if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" +" return false;\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" \n" +" const float4* verticesA, \n" +" const float4* uniqueEdgesA, \n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB, \n" +" __global const float4* uniqueEdgesB, \n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" {\n" +" int numFacesA = hullA->m_numFaces;\n" +" // Test normals from hullA\n" +" for(int i=0;i<numFacesA;i++)\n" +" {\n" +" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" +" float4 faceANormalWS = qtRotate(ornA,normal);\n" +" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" +" faceANormalWS*=-1.f;\n" +" curPlaneTests++;\n" +" float d;\n" +" if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" +" return false;\n" +" if(d<*dmin)\n" +" {\n" +" *dmin = d;\n" +" *sep = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* verticesA, \n" +" __global const float4* uniqueEdgesA, \n" +" __global const btGpuFace* facesA,\n" +" __global const int* indicesA,\n" +" const float4* verticesB,\n" +" const float4* uniqueEdgesB, \n" +" const btGpuFace* facesB,\n" +" const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" {\n" +" int numFacesA = hullA->m_numFaces;\n" +" // Test normals from hullA\n" +" for(int i=0;i<numFacesA;i++)\n" +" {\n" +" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" +" float4 faceANormalWS = qtRotate(ornA,normal);\n" +" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" +" faceANormalWS *= -1.f;\n" +" curPlaneTests++;\n" +" float d;\n" +" if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" +" return false;\n" +" if(d<*dmin)\n" +" {\n" +" *dmin = d;\n" +" *sep = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" const float4* verticesA, \n" +" const float4* uniqueEdgesA, \n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB, \n" +" __global const float4* uniqueEdgesB, \n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" int curEdgeEdge = 0;\n" +" // Test edges\n" +" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" +" {\n" +" const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" +" float4 edge0World = qtRotate(ornA,edge0);\n" +" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" +" {\n" +" const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" +" float4 edge1World = qtRotate(ornB,edge1);\n" +" float4 crossje = cross3(edge0World,edge1World);\n" +" curEdgeEdge++;\n" +" if(!IsAlmostZero(crossje))\n" +" {\n" +" crossje = normalize3(crossje);\n" +" if (dot3F4(DeltaC2,crossje)<0)\n" +" crossje *= -1.f;\n" +" float dist;\n" +" bool result = true;\n" +" {\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" +" project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" +" \n" +" if(Max0<Min1 || Max1<Min0)\n" +" result = false;\n" +" \n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" dist = d0<d1 ? d0:d1;\n" +" result = true;\n" +" }\n" +" \n" +" if(dist<*dmin)\n" +" {\n" +" *dmin = dist;\n" +" *sep = crossje;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"inline int findClippingFaces(const float4 separatingNormal,\n" +" const ConvexPolyhedronCL* hullA, \n" +" __global const ConvexPolyhedronCL* hullB,\n" +" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" +" __global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" int capacityWorldVerts,\n" +" const float minDist, float maxDist,\n" +" const float4* verticesA,\n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB,\n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" __global int4* clippingFaces, int pairIndex)\n" +"{\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" \n" +" \n" +" int closestFaceB=0;\n" +" float dmax = -FLT_MAX;\n" +" \n" +" {\n" +" for(int face=0;face<hullB->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" +" facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 WorldNormal = qtRotate(ornB, Normal);\n" +" float d = dot3F4(WorldNormal,separatingNormal);\n" +" if (d > dmax)\n" +" {\n" +" dmax = d;\n" +" closestFaceB = face;\n" +" }\n" +" }\n" +" }\n" +" \n" +" {\n" +" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" +" int numVertices = polyB.m_numIndices;\n" +" if (numVertices>capacityWorldVerts)\n" +" numVertices = capacityWorldVerts;\n" +" if (numVertices<0)\n" +" numVertices = 0;\n" +" \n" +" for(int e0=0;e0<numVertices;e0++)\n" +" {\n" +" if (e0<capacityWorldVerts)\n" +" {\n" +" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" +" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" +" }\n" +" }\n" +" }\n" +" \n" +" int closestFaceA=0;\n" +" {\n" +" float dmin = FLT_MAX;\n" +" for(int face=0;face<hullA->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(\n" +" facesA[hullA->m_faceOffset+face].m_plane.x,\n" +" facesA[hullA->m_faceOffset+face].m_plane.y,\n" +" facesA[hullA->m_faceOffset+face].m_plane.z,\n" +" 0.f);\n" +" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" +" \n" +" float d = dot3F4(faceANormalWS,separatingNormal);\n" +" if (d < dmin)\n" +" {\n" +" dmin = d;\n" +" closestFaceA = face;\n" +" worldNormalsA1[pairIndex] = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" \n" +" int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" +" if (numVerticesA>capacityWorldVerts)\n" +" numVerticesA = capacityWorldVerts;\n" +" if (numVerticesA<0)\n" +" numVerticesA=0;\n" +" \n" +" for(int e0=0;e0<numVerticesA;e0++)\n" +" {\n" +" if (e0<capacityWorldVerts)\n" +" {\n" +" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" +" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" +" }\n" +" }\n" +" \n" +" clippingFaces[pairIndex].x = closestFaceA;\n" +" clippingFaces[pairIndex].y = closestFaceB;\n" +" clippingFaces[pairIndex].z = numVerticesA;\n" +" clippingFaces[pairIndex].w = numWorldVertsB1;\n" +" \n" +" \n" +" return numContactsOut;\n" +"}\n" +"// work-in-progress\n" +"__kernel void findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,\n" +" __global const BodyData* rigidBodies,\n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global btAabbCL* aabbs,\n" +" __global float4* concaveSeparatingNormalsOut,\n" +" __global int* concaveHasSeparatingNormals,\n" +" __global int4* clippingFacesOut,\n" +" __global float4* worldVertsA1GPU,\n" +" __global float4* worldNormalsAGPU,\n" +" __global float4* worldVertsB1GPU,\n" +" __global float* dmins,\n" +" int vertexFaceCapacity,\n" +" int numConcavePairs\n" +" )\n" +"{\n" +" \n" +" int i = get_global_id(0);\n" +" if (i>=numConcavePairs)\n" +" return;\n" +" \n" +" concaveHasSeparatingNormals[i] = 0;\n" +" \n" +" int pairIdx = i;\n" +" \n" +" int bodyIndexA = concavePairs[i].x;\n" +" int bodyIndexB = concavePairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" +" collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" concavePairs[pairIdx].w = -1;\n" +" return;\n" +" }\n" +" \n" +" \n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" int numActualConcaveConvexTests = 0;\n" +" \n" +" int f = concavePairs[i].z;\n" +" \n" +" bool overlap = false;\n" +" \n" +" ConvexPolyhedronCL convexPolyhedronA;\n" +" \n" +" //add 3 vertices of the triangle\n" +" convexPolyhedronA.m_numVertices = 3;\n" +" convexPolyhedronA.m_vertexOffset = 0;\n" +" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" +" \n" +" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" +" float4 triMinAabb, triMaxAabb;\n" +" btAabbCL triAabb;\n" +" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" +" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" +" \n" +" float4 verticesA[3];\n" +" for (int i=0;i<3;i++)\n" +" {\n" +" int index = indices[face.m_indexOffset+i];\n" +" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" +" verticesA[i] = vert;\n" +" localCenter += vert;\n" +" \n" +" triAabb.m_min = min(triAabb.m_min,vert);\n" +" triAabb.m_max = max(triAabb.m_max,vert);\n" +" \n" +" }\n" +" \n" +" overlap = true;\n" +" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" +" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" +" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" +" \n" +" if (overlap)\n" +" {\n" +" float dmin = FLT_MAX;\n" +" int hasSeparatingAxis=5;\n" +" float4 sepAxis=make_float4(1,2,3,4);\n" +" \n" +" int localCC=0;\n" +" numActualConcaveConvexTests++;\n" +" \n" +" //a triangle has 3 unique edges\n" +" convexPolyhedronA.m_numUniqueEdges = 3;\n" +" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" +" float4 uniqueEdgesA[3];\n" +" \n" +" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" +" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" +" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" +" \n" +" \n" +" convexPolyhedronA.m_faceOffset = 0;\n" +" \n" +" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" +" \n" +" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" +" int indicesA[3+3+2+2+2];\n" +" int curUsedIndices=0;\n" +" int fidx=0;\n" +" \n" +" //front size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[0] = 0;\n" +" indicesA[1] = 1;\n" +" indicesA[2] = 2;\n" +" curUsedIndices+=3;\n" +" float c = face.m_plane.w;\n" +" facesA[fidx].m_plane.x = normal.x;\n" +" facesA[fidx].m_plane.y = normal.y;\n" +" facesA[fidx].m_plane.z = normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" //back size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[3]=2;\n" +" indicesA[4]=1;\n" +" indicesA[5]=0;\n" +" curUsedIndices+=3;\n" +" float c = dot(normal,verticesA[0]);\n" +" float c1 = -face.m_plane.w;\n" +" facesA[fidx].m_plane.x = -normal.x;\n" +" facesA[fidx].m_plane.y = -normal.y;\n" +" facesA[fidx].m_plane.z = -normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" \n" +" bool addEdgePlanes = true;\n" +" if (addEdgePlanes)\n" +" {\n" +" int numVertices=3;\n" +" int prevVertex = numVertices-1;\n" +" for (int i=0;i<numVertices;i++)\n" +" {\n" +" float4 v0 = verticesA[i];\n" +" float4 v1 = verticesA[prevVertex];\n" +" \n" +" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" +" float c = -dot(edgeNormal,v0);\n" +" \n" +" facesA[fidx].m_numIndices = 2;\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[curUsedIndices++]=i;\n" +" indicesA[curUsedIndices++]=prevVertex;\n" +" \n" +" facesA[fidx].m_plane.x = edgeNormal.x;\n" +" facesA[fidx].m_plane.y = edgeNormal.y;\n" +" facesA[fidx].m_plane.z = edgeNormal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" fidx++;\n" +" prevVertex = i;\n" +" }\n" +" }\n" +" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" +" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" +" \n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" \n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" \n" +" \n" +" \n" +" \n" +" ///////////////////\n" +" ///compound shape support\n" +" \n" +" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" int compoundChild = concavePairs[pairIdx].w;\n" +" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" }\n" +" //////////////////\n" +" \n" +" float4 c0local = convexPolyhedronA.m_localCenter;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" \n" +" \n" +" bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" vertices,uniqueEdges,faces,indices,\n" +" &sepAxis,&dmin);\n" +" hasSeparatingAxis = 4;\n" +" if (!sepA)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" +" posB,ornB,\n" +" posA,ornA,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,indices,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" &sepAxis,&dmin);\n" +" \n" +" if (!sepB)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis = 1;\n" +" }\n" +" } \n" +" \n" +" if (hasSeparatingAxis)\n" +" {\n" +" dmins[i] = dmin;\n" +" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" +" concaveHasSeparatingNormals[i]=1;\n" +" \n" +" } else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +" }\n" +" else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +"}\n" +"// work-in-progress\n" +"__kernel void findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs,\n" +" __global const BodyData* rigidBodies,\n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global btAabbCL* aabbs,\n" +" __global float4* concaveSeparatingNormalsOut,\n" +" __global int* concaveHasSeparatingNormals,\n" +" __global int4* clippingFacesOut,\n" +" __global float4* worldVertsA1GPU,\n" +" __global float4* worldNormalsAGPU,\n" +" __global float4* worldVertsB1GPU,\n" +" __global float* dmins,\n" +" int vertexFaceCapacity,\n" +" int numConcavePairs\n" +" )\n" +"{\n" +" \n" +" int i = get_global_id(0);\n" +" if (i>=numConcavePairs)\n" +" return;\n" +" \n" +" if (!concaveHasSeparatingNormals[i])\n" +" return;\n" +" \n" +" int pairIdx = i;\n" +" \n" +" int bodyIndexA = concavePairs[i].x;\n" +" int bodyIndexB = concavePairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" int numActualConcaveConvexTests = 0;\n" +" \n" +" int f = concavePairs[i].z;\n" +" \n" +" bool overlap = false;\n" +" \n" +" ConvexPolyhedronCL convexPolyhedronA;\n" +" \n" +" //add 3 vertices of the triangle\n" +" convexPolyhedronA.m_numVertices = 3;\n" +" convexPolyhedronA.m_vertexOffset = 0;\n" +" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" +" \n" +" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" +" float4 triMinAabb, triMaxAabb;\n" +" btAabbCL triAabb;\n" +" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" +" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" +" \n" +" float4 verticesA[3];\n" +" for (int i=0;i<3;i++)\n" +" {\n" +" int index = indices[face.m_indexOffset+i];\n" +" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" +" verticesA[i] = vert;\n" +" localCenter += vert;\n" +" \n" +" triAabb.m_min = min(triAabb.m_min,vert);\n" +" triAabb.m_max = max(triAabb.m_max,vert);\n" +" \n" +" }\n" +" \n" +" overlap = true;\n" +" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" +" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" +" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" +" \n" +" if (overlap)\n" +" {\n" +" float dmin = dmins[i];\n" +" int hasSeparatingAxis=5;\n" +" float4 sepAxis=make_float4(1,2,3,4);\n" +" sepAxis = concaveSeparatingNormalsOut[pairIdx];\n" +" \n" +" int localCC=0;\n" +" numActualConcaveConvexTests++;\n" +" \n" +" //a triangle has 3 unique edges\n" +" convexPolyhedronA.m_numUniqueEdges = 3;\n" +" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" +" float4 uniqueEdgesA[3];\n" +" \n" +" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" +" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" +" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" +" \n" +" \n" +" convexPolyhedronA.m_faceOffset = 0;\n" +" \n" +" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" +" \n" +" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" +" int indicesA[3+3+2+2+2];\n" +" int curUsedIndices=0;\n" +" int fidx=0;\n" +" \n" +" //front size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[0] = 0;\n" +" indicesA[1] = 1;\n" +" indicesA[2] = 2;\n" +" curUsedIndices+=3;\n" +" float c = face.m_plane.w;\n" +" facesA[fidx].m_plane.x = normal.x;\n" +" facesA[fidx].m_plane.y = normal.y;\n" +" facesA[fidx].m_plane.z = normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" //back size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[3]=2;\n" +" indicesA[4]=1;\n" +" indicesA[5]=0;\n" +" curUsedIndices+=3;\n" +" float c = dot(normal,verticesA[0]);\n" +" float c1 = -face.m_plane.w;\n" +" facesA[fidx].m_plane.x = -normal.x;\n" +" facesA[fidx].m_plane.y = -normal.y;\n" +" facesA[fidx].m_plane.z = -normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" \n" +" bool addEdgePlanes = true;\n" +" if (addEdgePlanes)\n" +" {\n" +" int numVertices=3;\n" +" int prevVertex = numVertices-1;\n" +" for (int i=0;i<numVertices;i++)\n" +" {\n" +" float4 v0 = verticesA[i];\n" +" float4 v1 = verticesA[prevVertex];\n" +" \n" +" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" +" float c = -dot(edgeNormal,v0);\n" +" \n" +" facesA[fidx].m_numIndices = 2;\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[curUsedIndices++]=i;\n" +" indicesA[curUsedIndices++]=prevVertex;\n" +" \n" +" facesA[fidx].m_plane.x = edgeNormal.x;\n" +" facesA[fidx].m_plane.y = edgeNormal.y;\n" +" facesA[fidx].m_plane.z = edgeNormal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" fidx++;\n" +" prevVertex = i;\n" +" }\n" +" }\n" +" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" +" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" +" \n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" \n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" \n" +" \n" +" \n" +" \n" +" ///////////////////\n" +" ///compound shape support\n" +" \n" +" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" int compoundChild = concavePairs[pairIdx].w;\n" +" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" }\n" +" //////////////////\n" +" \n" +" float4 c0local = convexPolyhedronA.m_localCenter;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" \n" +" \n" +" {\n" +" bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" vertices,uniqueEdges,faces,indices,\n" +" &sepAxis,&dmin);\n" +" \n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis = 1;\n" +" }\n" +" }\n" +" \n" +" \n" +" if (hasSeparatingAxis)\n" +" {\n" +" sepAxis.w = dmin;\n" +" dmins[i] = dmin;\n" +" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" +" concaveHasSeparatingNormals[i]=1;\n" +" \n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" \n" +" findClippingFaces(sepAxis,\n" +" &convexPolyhedronA,\n" +" &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" worldVertsA1GPU,\n" +" worldNormalsAGPU,\n" +" worldVertsB1GPU,\n" +" vertexFaceCapacity,\n" +" minDist, maxDist,\n" +" verticesA,\n" +" facesA,\n" +" indicesA,\n" +" vertices,\n" +" faces,\n" +" indices,\n" +" clippingFacesOut, pairIdx);\n" +" \n" +" \n" +" } else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +" }\n" +" else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +" \n" +" concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h new file mode 100644 index 0000000000..6f8b0a90db --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h @@ -0,0 +1,2104 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* satKernelsCL= \ +"//keep this enum in sync with the CPU version (in btCollidable.h)\n" +"//written by Erwin Coumans\n" +"#define SHAPE_CONVEX_HULL 3\n" +"#define SHAPE_CONCAVE_TRIMESH 5\n" +"#define TRIANGLE_NUM_CONVEX_FACES 5\n" +"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" +"#define B3_MAX_STACK_DEPTH 256\n" +"typedef unsigned int u32;\n" +"///keep this in sync with btCollidable.h\n" +"typedef struct\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" \n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +" \n" +"} btCollidableGpu;\n" +"#define MAX_NUM_PARTS_IN_BITS 10\n" +"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" +"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes\n" +" int m_escapeIndexOrTriangleIndex;\n" +"} b3QuantizedBvhNode;\n" +"typedef struct\n" +"{\n" +" float4 m_aabbMin;\n" +" float4 m_aabbMax;\n" +" float4 m_quantization;\n" +" int m_numNodes;\n" +" int m_numSubTrees;\n" +" int m_nodeOffset;\n" +" int m_subTreeOffset;\n" +"} b3BvhInfo;\n" +"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" unsigned int x=0;\n" +" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" +" // Get only the lower bits where the triangle index is stored\n" +" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" +"}\n" +"int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" unsigned int x=0;\n" +" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" +" // Get only the lower bits where the triangle index is stored\n" +" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" +"}\n" +"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" +" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" +"}\n" +"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" +" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" +"}\n" +" \n" +"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" return -rootNode->m_escapeIndexOrTriangleIndex;\n" +"}\n" +"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" +"{\n" +" return -rootNode->m_escapeIndexOrTriangleIndex;\n" +"}\n" +"typedef struct\n" +"{\n" +" //12 bytes\n" +" unsigned short int m_quantizedAabbMin[3];\n" +" unsigned short int m_quantizedAabbMax[3];\n" +" //4 bytes, points to the root of the subtree\n" +" int m_rootNodeIndex;\n" +" //4 bytes\n" +" int m_subtreeSize;\n" +" int m_padding[3];\n" +"} b3BvhSubtreeInfo;\n" +"typedef struct\n" +"{\n" +" float4 m_childPosition;\n" +" float4 m_childOrientation;\n" +" int m_shapeIndex;\n" +" int m_unused0;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"} btGpuChildShape;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" float4 m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} BodyData;\n" +"typedef struct \n" +"{\n" +" float4 m_localCenter;\n" +" float4 m_extents;\n" +" float4 mC;\n" +" float4 mE;\n" +" \n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"} ConvexPolyhedronCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} btAabbCL;\n" +"#ifndef B3_AABB_H\n" +"#define B3_AABB_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3Aabb b3Aabb_t;\n" +"struct b3Aabb\n" +"{\n" +" union\n" +" {\n" +" float m_min[4];\n" +" b3Float4 m_minVec;\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float m_max[4];\n" +" b3Float4 m_maxVec;\n" +" int m_signedMaxIndices[4];\n" +" };\n" +"};\n" +"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" +" b3Float4ConstArg pos,\n" +" b3QuatConstArg orn,\n" +" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" +"{\n" +" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" +" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" +" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" +" b3Mat3x3 m;\n" +" m = b3QuatGetRotationMatrix(orn);\n" +" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" +" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" +" \n" +" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" +" 0.f);\n" +" *aabbMinOut = center-extent;\n" +" *aabbMaxOut = center+extent;\n" +"}\n" +"/// conservative test for overlap between two aabbs\n" +"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" +" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" +" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" +" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"#endif //B3_AABB_H\n" +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"#ifndef B3_INT2_H\n" +"#define B3_INT2_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#define b3UnsignedInt2 uint2\n" +"#define b3Int2 int2\n" +"#define b3MakeInt2 (int2)\n" +"#endif //__cplusplus\n" +"#endif\n" +"typedef struct\n" +"{\n" +" float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +"} btGpuFace;\n" +"#define make_float4 (float4)\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +" \n" +"// float4 a1 = make_float4(a.xyz,0.f);\n" +"// float4 b1 = make_float4(b.xyz,0.f);\n" +"// return cross(a1,b1);\n" +"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" +" \n" +" // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" +" \n" +" //return c;\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" v = make_float4(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"__inline\n" +"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" +"{\n" +" return qtRotate( *orientation, *p ) + (*translation);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"}\n" +"inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" +"const float4* dir, const float4* vertices, float* min, float* max)\n" +"{\n" +" min[0] = FLT_MAX;\n" +" max[0] = -FLT_MAX;\n" +" int numVerts = hull->m_numVertices;\n" +" const float4 localDir = qtInvRotate(orn,*dir);\n" +" float offset = dot(pos,*dir);\n" +" for(int i=0;i<numVerts;i++)\n" +" {\n" +" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" +" if(dp < min[0]) \n" +" min[0] = dp;\n" +" if(dp > max[0]) \n" +" max[0] = dp;\n" +" }\n" +" if(min[0]>max[0])\n" +" {\n" +" float tmp = min[0];\n" +" min[0] = max[0];\n" +" max[0] = tmp;\n" +" }\n" +" min[0] += offset;\n" +" max[0] += offset;\n" +"}\n" +"inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" +"const float4* dir, __global const float4* vertices, float* min, float* max)\n" +"{\n" +" min[0] = FLT_MAX;\n" +" max[0] = -FLT_MAX;\n" +" int numVerts = hull->m_numVertices;\n" +" const float4 localDir = qtInvRotate(orn,*dir);\n" +" float offset = dot(pos,*dir);\n" +" for(int i=0;i<numVerts;i++)\n" +" {\n" +" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" +" if(dp < min[0]) \n" +" min[0] = dp;\n" +" if(dp > max[0]) \n" +" max[0] = dp;\n" +" }\n" +" if(min[0]>max[0])\n" +" {\n" +" float tmp = min[0];\n" +" min[0] = max[0];\n" +" max[0] = tmp;\n" +" }\n" +" min[0] += offset;\n" +" max[0] += offset;\n" +"}\n" +"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA,const float4 ornA,\n" +" const float4 posB,const float4 ornB,\n" +" float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" +"{\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" +" project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" *depth = d0<d1 ? d0:d1;\n" +" return true;\n" +"}\n" +"inline bool IsAlmostZero(const float4 v)\n" +"{\n" +" if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" +" return false;\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" \n" +" const float4* verticesA, \n" +" const float4* uniqueEdgesA, \n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB, \n" +" __global const float4* uniqueEdgesB, \n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" {\n" +" int numFacesA = hullA->m_numFaces;\n" +" // Test normals from hullA\n" +" for(int i=0;i<numFacesA;i++)\n" +" {\n" +" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" +" float4 faceANormalWS = qtRotate(ornA,normal);\n" +" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" +" faceANormalWS*=-1.f;\n" +" curPlaneTests++;\n" +" float d;\n" +" if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" +" return false;\n" +" if(d<*dmin)\n" +" {\n" +" *dmin = d;\n" +" *sep = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* verticesA, \n" +" __global const float4* uniqueEdgesA, \n" +" __global const btGpuFace* facesA,\n" +" __global const int* indicesA,\n" +" const float4* verticesB,\n" +" const float4* uniqueEdgesB, \n" +" const btGpuFace* facesB,\n" +" const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" {\n" +" int numFacesA = hullA->m_numFaces;\n" +" // Test normals from hullA\n" +" for(int i=0;i<numFacesA;i++)\n" +" {\n" +" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" +" float4 faceANormalWS = qtRotate(ornA,normal);\n" +" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" +" faceANormalWS *= -1.f;\n" +" curPlaneTests++;\n" +" float d;\n" +" if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" +" return false;\n" +" if(d<*dmin)\n" +" {\n" +" *dmin = d;\n" +" *sep = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" const float4* verticesA, \n" +" const float4* uniqueEdgesA, \n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB, \n" +" __global const float4* uniqueEdgesB, \n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" int curEdgeEdge = 0;\n" +" // Test edges\n" +" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" +" {\n" +" const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" +" float4 edge0World = qtRotate(ornA,edge0);\n" +" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" +" {\n" +" const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" +" float4 edge1World = qtRotate(ornB,edge1);\n" +" float4 crossje = cross3(edge0World,edge1World);\n" +" curEdgeEdge++;\n" +" if(!IsAlmostZero(crossje))\n" +" {\n" +" crossje = normalize3(crossje);\n" +" if (dot3F4(DeltaC2,crossje)<0)\n" +" crossje *= -1.f;\n" +" float dist;\n" +" bool result = true;\n" +" {\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" +" project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" +" \n" +" if(Max0<Min1 || Max1<Min0)\n" +" result = false;\n" +" \n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" dist = d0<d1 ? d0:d1;\n" +" result = true;\n" +" }\n" +" \n" +" if(dist<*dmin)\n" +" {\n" +" *dmin = dist;\n" +" *sep = crossje;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA,const float4 ornA,\n" +" const float4 posB,const float4 ornB,\n" +" float4* sep_axis, __global const float4* vertices,float* depth)\n" +"{\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);\n" +" project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);\n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" *depth = d0<d1 ? d0:d1;\n" +" return true;\n" +"}\n" +"bool findSeparatingAxis( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* vertices, \n" +" __global const float4* uniqueEdges, \n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" \n" +" int curPlaneTests=0;\n" +" {\n" +" int numFacesA = hullA->m_numFaces;\n" +" // Test normals from hullA\n" +" for(int i=0;i<numFacesA;i++)\n" +" {\n" +" const float4 normal = faces[hullA->m_faceOffset+i].m_plane;\n" +" float4 faceANormalWS = qtRotate(ornA,normal);\n" +" \n" +" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" +" faceANormalWS*=-1.f;\n" +" \n" +" curPlaneTests++;\n" +" \n" +" float d;\n" +" if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n" +" return false;\n" +" \n" +" if(d<*dmin)\n" +" {\n" +" *dmin = d;\n" +" *sep = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" \n" +" return true;\n" +"}\n" +"bool findSeparatingAxisUnitSphere( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* vertices,\n" +" __global const float4* unitSphereDirections,\n" +" int numUnitSphereDirections,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" int curEdgeEdge = 0;\n" +" // Test unit sphere directions\n" +" for (int i=0;i<numUnitSphereDirections;i++)\n" +" {\n" +" float4 crossje;\n" +" crossje = unitSphereDirections[i]; \n" +" if (dot3F4(DeltaC2,crossje)>0)\n" +" crossje *= -1.f;\n" +" {\n" +" float dist;\n" +" bool result = true;\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" +" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" +" \n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" \n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" dist = d0<d1 ? d0:d1;\n" +" result = true;\n" +" \n" +" if(dist<*dmin)\n" +" {\n" +" *dmin = dist;\n" +" *sep = crossje;\n" +" }\n" +" }\n" +" }\n" +" \n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"bool findSeparatingAxisEdgeEdge( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" +" const float4 posA1,\n" +" const float4 ornA,\n" +" const float4 posB1,\n" +" const float4 ornB,\n" +" const float4 DeltaC2,\n" +" __global const float4* vertices, \n" +" __global const float4* uniqueEdges, \n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" float4* sep,\n" +" float* dmin)\n" +"{\n" +" \n" +" float4 posA = posA1;\n" +" posA.w = 0.f;\n" +" float4 posB = posB1;\n" +" posB.w = 0.f;\n" +" int curPlaneTests=0;\n" +" int curEdgeEdge = 0;\n" +" // Test edges\n" +" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" +" {\n" +" const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n" +" float4 edge0World = qtRotate(ornA,edge0);\n" +" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" +" {\n" +" const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n" +" float4 edge1World = qtRotate(ornB,edge1);\n" +" float4 crossje = cross3(edge0World,edge1World);\n" +" curEdgeEdge++;\n" +" if(!IsAlmostZero(crossje))\n" +" {\n" +" crossje = normalize3(crossje);\n" +" if (dot3F4(DeltaC2,crossje)<0)\n" +" crossje*=-1.f;\n" +" \n" +" float dist;\n" +" bool result = true;\n" +" {\n" +" float Min0,Max0;\n" +" float Min1,Max1;\n" +" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" +" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" +" \n" +" if(Max0<Min1 || Max1<Min0)\n" +" return false;\n" +" \n" +" float d0 = Max0 - Min1;\n" +" float d1 = Max1 - Min0;\n" +" dist = d0<d1 ? d0:d1;\n" +" result = true;\n" +" }\n" +" \n" +" if(dist<*dmin)\n" +" {\n" +" *dmin = dist;\n" +" *sep = crossje;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" +" {\n" +" *sep = -(*sep);\n" +" }\n" +" return true;\n" +"}\n" +"// work-in-progress\n" +"__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global volatile float4* gpuCompoundSepNormalsOut,\n" +" __global volatile int* gpuHasCompoundSepNormalsOut,\n" +" int numCompoundPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i<numCompoundPairs)\n" +" {\n" +" int bodyIndexA = gpuCompoundPairs[i].x;\n" +" int bodyIndexB = gpuCompoundPairs[i].y;\n" +" int childShapeIndexA = gpuCompoundPairs[i].z;\n" +" int childShapeIndexB = gpuCompoundPairs[i].w;\n" +" \n" +" int collidableIndexA = -1;\n" +" int collidableIndexB = -1;\n" +" \n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" \n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" \n" +" if (childShapeIndexA >= 0)\n" +" {\n" +" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" +" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" +" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" +" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" +" float4 newOrnA = qtMul(ornA,childOrnA);\n" +" posA = newPosA;\n" +" ornA = newOrnA;\n" +" } else\n" +" {\n" +" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" }\n" +" \n" +" if (childShapeIndexB>=0)\n" +" {\n" +" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" } else\n" +" {\n" +" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" +" }\n" +" \n" +" gpuHasCompoundSepNormalsOut[i] = 0;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" +" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" +" \n" +" if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n" +" {\n" +" return;\n" +" }\n" +" int hasSeparatingAxis = 5;\n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" posA.w = 0.f;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" float4 sepNormal = make_float4(1,0,0,0);\n" +" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" +" hasSeparatingAxis = 4;\n" +" if (!sepA)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" +" if (!sepB)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else//(!sepB)\n" +" {\n" +" bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" +" if (sepEE)\n" +" {\n" +" gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n" +" gpuHasCompoundSepNormalsOut[i] = 1;\n" +" }//sepEE\n" +" }//(!sepB)\n" +" }//(!sepA)\n" +" \n" +" \n" +" }\n" +" \n" +"}\n" +"inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" +"{\n" +" b3Float4 vecOut;\n" +" vecOut = b3MakeFloat4(\n" +" (float)(vecIn[0]) / (quantization.x),\n" +" (float)(vecIn[1]) / (quantization.y),\n" +" (float)(vecIn[2]) / (quantization.z),\n" +" 0.f);\n" +" vecOut += bvhAabbMin;\n" +" return vecOut;\n" +"}\n" +"inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" +"{\n" +" b3Float4 vecOut;\n" +" vecOut = b3MakeFloat4(\n" +" (float)(vecIn[0]) / (quantization.x),\n" +" (float)(vecIn[1]) / (quantization.y),\n" +" (float)(vecIn[2]) / (quantization.z),\n" +" 0.f);\n" +" vecOut += bvhAabbMin;\n" +" return vecOut;\n" +"}\n" +"// work-in-progress\n" +"__kernel void findCompoundPairsKernel( __global const int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global b3Aabb_t* aabbLocalSpace,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global volatile int4* gpuCompoundPairsOut,\n" +" __global volatile int* numCompoundPairsOut,\n" +" __global const b3BvhSubtreeInfo* subtrees,\n" +" __global const b3QuantizedBvhNode* quantizedNodes,\n" +" __global const b3BvhInfo* bvhInfos,\n" +" int numPairs,\n" +" int maxNumCompoundPairsCapacity\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i<numPairs)\n" +" {\n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" //once the broadphase avoids static-static pairs, we can remove this test\n" +" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" +" {\n" +" return;\n" +" }\n" +" if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" +" {\n" +" int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;\n" +" int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;\n" +" int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;\n" +" int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;\n" +" int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;\n" +" int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;\n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" b3Quat ornA = rigidBodies[bodyIndexA].m_quat;\n" +" b3Quat ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" \n" +" for (int p=0;p<numSubTreesA;p++)\n" +" {\n" +" b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];\n" +" //bvhInfos[bvhA].m_quantization\n" +" b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" +" b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" +" b3Float4 aabbAMinOut,aabbAMaxOut;\n" +" float margin=0.f;\n" +" b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" +" \n" +" for (int q=0;q<numSubTreesB;q++)\n" +" {\n" +" b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];\n" +" b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" +" b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" +" b3Float4 aabbBMinOut,aabbBMaxOut;\n" +" float margin=0.f;\n" +" b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" +" \n" +" \n" +" bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" +" if (aabbOverlap)\n" +" {\n" +" \n" +" int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;\n" +" int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;\n" +" int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;\n" +" int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;\n" +" b3Int2 nodeStack[B3_MAX_STACK_DEPTH];\n" +" b3Int2 node0;\n" +" node0.x = startNodeIndexA;\n" +" node0.y = startNodeIndexB;\n" +" int maxStackDepth = B3_MAX_STACK_DEPTH;\n" +" int depth=0;\n" +" nodeStack[depth++]=node0;\n" +" do\n" +" {\n" +" b3Int2 node = nodeStack[--depth];\n" +" b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" +" b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" +" b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" +" b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" +" float margin=0.f;\n" +" b3Float4 aabbAMinOut,aabbAMaxOut;\n" +" b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" +" b3Float4 aabbBMinOut,aabbBMaxOut;\n" +" b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" +" \n" +" bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" +" if (nodeOverlap)\n" +" {\n" +" bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);\n" +" bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);\n" +" bool isInternalA = !isLeafA;\n" +" bool isInternalB = !isLeafB;\n" +" //fail, even though it might hit two leaf nodes\n" +" if (depth+4>maxStackDepth && !(isLeafA && isLeafB))\n" +" {\n" +" //printf(\"Error: traversal exceeded maxStackDepth\");\n" +" continue;\n" +" }\n" +" if(isInternalA)\n" +" {\n" +" int nodeAleftChild = node.x+1;\n" +" bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);\n" +" int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);\n" +" if(isInternalB)\n" +" { \n" +" int nodeBleftChild = node.y+1;\n" +" bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" +" int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" +" nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);\n" +" nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);\n" +" nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);\n" +" nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);\n" +" }\n" +" else\n" +" {\n" +" nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);\n" +" nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);\n" +" }\n" +" }\n" +" else\n" +" {\n" +" if(isInternalB)\n" +" {\n" +" int nodeBleftChild = node.y+1;\n" +" bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" +" int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" +" nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);\n" +" nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);\n" +" }\n" +" else\n" +" {\n" +" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" +" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" {\n" +" int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);\n" +" int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);\n" +" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" +" }\n" +" }\n" +" }\n" +" }\n" +" } while (depth);\n" +" }\n" +" }\n" +" }\n" +" \n" +" return;\n" +" }\n" +" if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" +" {\n" +" if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n" +" {\n" +" int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n" +" for (int c=0;c<numChildrenA;c++)\n" +" {\n" +" int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n" +" int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" +" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" +" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" +" float4 newOrnA = qtMul(ornA,childOrnA);\n" +" int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n" +" b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];\n" +" float margin = 0.f;\n" +" \n" +" b3Float4 aabbAMinWS;\n" +" b3Float4 aabbAMaxWS;\n" +" \n" +" b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,\n" +" newPosA,\n" +" newOrnA,\n" +" &aabbAMinWS,&aabbAMaxWS);\n" +" \n" +" \n" +" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" +" for (int b=0;b<numChildrenB;b++)\n" +" {\n" +" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];\n" +" \n" +" b3Float4 aabbBMinWS;\n" +" b3Float4 aabbBMaxWS;\n" +" \n" +" b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,\n" +" newPosB,\n" +" newOrnB,\n" +" &aabbBMinWS,&aabbBMaxWS);\n" +" \n" +" \n" +" \n" +" bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);\n" +" if (aabbOverlap)\n" +" {\n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" float4 posA = newPosA;\n" +" posA.w = 0.f;\n" +" float4 posB = newPosB;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = newOrnA;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =newOrnB;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" {//\n" +" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" +" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" {\n" +" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" +" }\n" +" }//\n" +" }//fi(1)\n" +" } //for (int b=0\n" +" }//if (collidables[collidableIndexB].\n" +" else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" if (1)\n" +" {\n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" float4 posA = newPosA;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = newOrnA;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" {\n" +" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" +" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" {\n" +" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n" +" }//if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" }//\n" +" }//fi (1)\n" +" }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" }//for (int b=0;b<numChildrenB;b++) \n" +" return;\n" +" }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) \n" +" && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" +" {\n" +" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" +" for (int b=0;b<numChildrenB;b++)\n" +" {\n" +" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = qtRotate(ornB,childPosB)+posB;\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" //////////////////////////////////////\n" +" if (1)\n" +" {\n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = newPosB;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =newOrnB;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" {//\n" +" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" +" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" {\n" +" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n" +" }//fi (compoundPairIdx<maxNumCompoundPairsCapacity)\n" +" }//\n" +" }//fi (1) \n" +" }//for (int b=0;b<numChildrenB;b++)\n" +" return;\n" +" }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" return;\n" +" }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" +" }//i<numPairs\n" +"}\n" +"// work-in-progress\n" +"__kernel void findSeparatingAxisKernel( __global const int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global volatile float4* separatingNormals,\n" +" __global volatile int* hasSeparatingAxis,\n" +" int numPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" \n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" //once the broadphase avoids static-static pairs, we can remove this test\n" +" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" return;\n" +" }\n" +" \n" +" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" return;\n" +" }\n" +" \n" +" if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" return;\n" +" }\n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" float4 sepNormal;\n" +" \n" +" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" hasSeparatingAxis[i] = 4;\n" +" if (!sepA)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" +" posA,ornA,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" if (!sepB)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis[i] = 1;\n" +" separatingNormals[i] = sepNormal;\n" +" }\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +"__kernel void findSeparatingAxisVertexFaceKernel( __global const int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global volatile float4* separatingNormals,\n" +" __global volatile int* hasSeparatingAxis,\n" +" __global float* dmins,\n" +" int numPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" \n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" hasSeparatingAxis[i] = 0; \n" +" \n" +" //once the broadphase avoids static-static pairs, we can remove this test\n" +" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" +" {\n" +" return;\n" +" }\n" +" \n" +" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" +" {\n" +" return;\n" +" }\n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" float dmin = FLT_MAX;\n" +" dmins[i] = dmin;\n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" float4 sepNormal;\n" +" \n" +" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" hasSeparatingAxis[i] = 4;\n" +" if (!sepA)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" +" posA,ornA,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" if (sepB)\n" +" {\n" +" dmins[i] = dmin;\n" +" hasSeparatingAxis[i] = 1;\n" +" separatingNormals[i] = sepNormal;\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +"__kernel void findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, \n" +" __global const BodyData* rigidBodies, \n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global btAabbCL* aabbs,\n" +" __global float4* separatingNormals,\n" +" __global int* hasSeparatingAxis,\n" +" __global float* dmins,\n" +" __global const float4* unitSphereDirections,\n" +" int numUnitSphereDirections,\n" +" int numPairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" \n" +" if (i<numPairs)\n" +" {\n" +" if (hasSeparatingAxis[i])\n" +" {\n" +" \n" +" int bodyIndexA = pairs[i].x;\n" +" int bodyIndexB = pairs[i].y;\n" +" \n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" \n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" \n" +" \n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" \n" +" float dmin = dmins[i];\n" +" \n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" float4 sepNormal = separatingNormals[i];\n" +" \n" +" \n" +" \n" +" bool sepEE = false;\n" +" int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" +" if (numEdgeEdgeDirections<=numUnitSphereDirections)\n" +" {\n" +" sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,\n" +" indices,&sepNormal,&dmin);\n" +" \n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis[i] = 1;\n" +" separatingNormals[i] = sepNormal;\n" +" }\n" +" }\n" +" /*\n" +" ///else case is a separate kernel, to make Mac OSX OpenCL compiler happy\n" +" else\n" +" {\n" +" sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" vertices,unitSphereDirections,numUnitSphereDirections,\n" +" &sepNormal,&dmin);\n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis[i] = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis[i] = 1;\n" +" separatingNormals[i] = sepNormal;\n" +" }\n" +" }\n" +" */\n" +" } //if (hasSeparatingAxis[i])\n" +" }//(i<numPairs)\n" +"}\n" +"inline int findClippingFaces(const float4 separatingNormal,\n" +" const ConvexPolyhedronCL* hullA, \n" +" __global const ConvexPolyhedronCL* hullB,\n" +" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" +" __global float4* worldVertsA1,\n" +" __global float4* worldNormalsA1,\n" +" __global float4* worldVertsB1,\n" +" int capacityWorldVerts,\n" +" const float minDist, float maxDist,\n" +" const float4* verticesA,\n" +" const btGpuFace* facesA,\n" +" const int* indicesA,\n" +" __global const float4* verticesB,\n" +" __global const btGpuFace* facesB,\n" +" __global const int* indicesB,\n" +" __global int4* clippingFaces, int pairIndex)\n" +"{\n" +" int numContactsOut = 0;\n" +" int numWorldVertsB1= 0;\n" +" \n" +" \n" +" int closestFaceB=0;\n" +" float dmax = -FLT_MAX;\n" +" \n" +" {\n" +" for(int face=0;face<hullB->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" +" facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" +" const float4 WorldNormal = qtRotate(ornB, Normal);\n" +" float d = dot3F4(WorldNormal,separatingNormal);\n" +" if (d > dmax)\n" +" {\n" +" dmax = d;\n" +" closestFaceB = face;\n" +" }\n" +" }\n" +" }\n" +" \n" +" {\n" +" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" +" int numVertices = polyB.m_numIndices;\n" +" if (numVertices>capacityWorldVerts)\n" +" numVertices = capacityWorldVerts;\n" +" \n" +" for(int e0=0;e0<numVertices;e0++)\n" +" {\n" +" if (e0<capacityWorldVerts)\n" +" {\n" +" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" +" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" +" }\n" +" }\n" +" }\n" +" \n" +" int closestFaceA=0;\n" +" {\n" +" float dmin = FLT_MAX;\n" +" for(int face=0;face<hullA->m_numFaces;face++)\n" +" {\n" +" const float4 Normal = make_float4(\n" +" facesA[hullA->m_faceOffset+face].m_plane.x,\n" +" facesA[hullA->m_faceOffset+face].m_plane.y,\n" +" facesA[hullA->m_faceOffset+face].m_plane.z,\n" +" 0.f);\n" +" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" +" \n" +" float d = dot3F4(faceANormalWS,separatingNormal);\n" +" if (d < dmin)\n" +" {\n" +" dmin = d;\n" +" closestFaceA = face;\n" +" worldNormalsA1[pairIndex] = faceANormalWS;\n" +" }\n" +" }\n" +" }\n" +" \n" +" int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" +" if (numVerticesA>capacityWorldVerts)\n" +" numVerticesA = capacityWorldVerts;\n" +" \n" +" for(int e0=0;e0<numVerticesA;e0++)\n" +" {\n" +" if (e0<capacityWorldVerts)\n" +" {\n" +" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" +" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" +" }\n" +" }\n" +" \n" +" clippingFaces[pairIndex].x = closestFaceA;\n" +" clippingFaces[pairIndex].y = closestFaceB;\n" +" clippingFaces[pairIndex].z = numVerticesA;\n" +" clippingFaces[pairIndex].w = numWorldVertsB1;\n" +" \n" +" \n" +" return numContactsOut;\n" +"}\n" +"// work-in-progress\n" +"__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n" +" __global const BodyData* rigidBodies,\n" +" __global const btCollidableGpu* collidables,\n" +" __global const ConvexPolyhedronCL* convexShapes, \n" +" __global const float4* vertices,\n" +" __global const float4* uniqueEdges,\n" +" __global const btGpuFace* faces,\n" +" __global const int* indices,\n" +" __global const btGpuChildShape* gpuChildShapes,\n" +" __global btAabbCL* aabbs,\n" +" __global float4* concaveSeparatingNormalsOut,\n" +" __global int* concaveHasSeparatingNormals,\n" +" __global int4* clippingFacesOut,\n" +" __global float4* worldVertsA1GPU,\n" +" __global float4* worldNormalsAGPU,\n" +" __global float4* worldVertsB1GPU,\n" +" int vertexFaceCapacity,\n" +" int numConcavePairs\n" +" )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConcavePairs)\n" +" return;\n" +" concaveHasSeparatingNormals[i] = 0;\n" +" int pairIdx = i;\n" +" int bodyIndexA = concavePairs[i].x;\n" +" int bodyIndexB = concavePairs[i].y;\n" +" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" +" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" +" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" +" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" +" if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" +" collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" concavePairs[pairIdx].w = -1;\n" +" return;\n" +" }\n" +" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" +" int numActualConcaveConvexTests = 0;\n" +" \n" +" int f = concavePairs[i].z;\n" +" \n" +" bool overlap = false;\n" +" \n" +" ConvexPolyhedronCL convexPolyhedronA;\n" +" //add 3 vertices of the triangle\n" +" convexPolyhedronA.m_numVertices = 3;\n" +" convexPolyhedronA.m_vertexOffset = 0;\n" +" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" +" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" +" float4 triMinAabb, triMaxAabb;\n" +" btAabbCL triAabb;\n" +" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" +" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" +" \n" +" float4 verticesA[3];\n" +" for (int i=0;i<3;i++)\n" +" {\n" +" int index = indices[face.m_indexOffset+i];\n" +" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" +" verticesA[i] = vert;\n" +" localCenter += vert;\n" +" \n" +" triAabb.m_min = min(triAabb.m_min,vert); \n" +" triAabb.m_max = max(triAabb.m_max,vert); \n" +" }\n" +" overlap = true;\n" +" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" +" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" +" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" +" \n" +" if (overlap)\n" +" {\n" +" float dmin = FLT_MAX;\n" +" int hasSeparatingAxis=5;\n" +" float4 sepAxis=make_float4(1,2,3,4);\n" +" int localCC=0;\n" +" numActualConcaveConvexTests++;\n" +" //a triangle has 3 unique edges\n" +" convexPolyhedronA.m_numUniqueEdges = 3;\n" +" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" +" float4 uniqueEdgesA[3];\n" +" \n" +" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" +" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" +" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" +" convexPolyhedronA.m_faceOffset = 0;\n" +" \n" +" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" +" \n" +" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" +" int indicesA[3+3+2+2+2];\n" +" int curUsedIndices=0;\n" +" int fidx=0;\n" +" //front size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[0] = 0;\n" +" indicesA[1] = 1;\n" +" indicesA[2] = 2;\n" +" curUsedIndices+=3;\n" +" float c = face.m_plane.w;\n" +" facesA[fidx].m_plane.x = normal.x;\n" +" facesA[fidx].m_plane.y = normal.y;\n" +" facesA[fidx].m_plane.z = normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" //back size of triangle\n" +" {\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[3]=2;\n" +" indicesA[4]=1;\n" +" indicesA[5]=0;\n" +" curUsedIndices+=3;\n" +" float c = dot(normal,verticesA[0]);\n" +" float c1 = -face.m_plane.w;\n" +" facesA[fidx].m_plane.x = -normal.x;\n" +" facesA[fidx].m_plane.y = -normal.y;\n" +" facesA[fidx].m_plane.z = -normal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" facesA[fidx].m_numIndices=3;\n" +" }\n" +" fidx++;\n" +" bool addEdgePlanes = true;\n" +" if (addEdgePlanes)\n" +" {\n" +" int numVertices=3;\n" +" int prevVertex = numVertices-1;\n" +" for (int i=0;i<numVertices;i++)\n" +" {\n" +" float4 v0 = verticesA[i];\n" +" float4 v1 = verticesA[prevVertex];\n" +" \n" +" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" +" float c = -dot(edgeNormal,v0);\n" +" facesA[fidx].m_numIndices = 2;\n" +" facesA[fidx].m_indexOffset=curUsedIndices;\n" +" indicesA[curUsedIndices++]=i;\n" +" indicesA[curUsedIndices++]=prevVertex;\n" +" \n" +" facesA[fidx].m_plane.x = edgeNormal.x;\n" +" facesA[fidx].m_plane.y = edgeNormal.y;\n" +" facesA[fidx].m_plane.z = edgeNormal.z;\n" +" facesA[fidx].m_plane.w = c;\n" +" fidx++;\n" +" prevVertex = i;\n" +" }\n" +" }\n" +" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" +" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" +" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" +" posA.w = 0.f;\n" +" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" +" posB.w = 0.f;\n" +" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" +" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" +" \n" +" ///////////////////\n" +" ///compound shape support\n" +" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" +" {\n" +" int compoundChild = concavePairs[pairIdx].w;\n" +" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" +" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" +" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" +" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" +" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" +" float4 newOrnB = qtMul(ornB,childOrnB);\n" +" posB = newPosB;\n" +" ornB = newOrnB;\n" +" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" +" }\n" +" //////////////////\n" +" float4 c0local = convexPolyhedronA.m_localCenter;\n" +" float4 c0 = transform(&c0local, &posA, &ornA);\n" +" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" +" float4 c1 = transform(&c1local,&posB,&ornB);\n" +" const float4 DeltaC2 = c0 - c1;\n" +" bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" vertices,uniqueEdges,faces,indices,\n" +" &sepAxis,&dmin);\n" +" hasSeparatingAxis = 4;\n" +" if (!sepA)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" +" posB,ornB,\n" +" posA,ornA,\n" +" DeltaC2,\n" +" vertices,uniqueEdges,faces,indices,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" &sepAxis,&dmin);\n" +" if (!sepB)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" DeltaC2,\n" +" verticesA,uniqueEdgesA,facesA,indicesA,\n" +" vertices,uniqueEdges,faces,indices,\n" +" &sepAxis,&dmin);\n" +" \n" +" if (!sepEE)\n" +" {\n" +" hasSeparatingAxis = 0;\n" +" } else\n" +" {\n" +" hasSeparatingAxis = 1;\n" +" }\n" +" }\n" +" } \n" +" \n" +" if (hasSeparatingAxis)\n" +" {\n" +" sepAxis.w = dmin;\n" +" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" +" concaveHasSeparatingNormals[i]=1;\n" +" float minDist = -1e30f;\n" +" float maxDist = 0.02f;\n" +" \n" +" findClippingFaces(sepAxis,\n" +" &convexPolyhedronA,\n" +" &convexShapes[shapeIndexB],\n" +" posA,ornA,\n" +" posB,ornB,\n" +" worldVertsA1GPU,\n" +" worldNormalsAGPU,\n" +" worldVertsB1GPU,\n" +" vertexFaceCapacity,\n" +" minDist, maxDist,\n" +" verticesA,\n" +" facesA,\n" +" indicesA,\n" +" vertices,\n" +" faces,\n" +" indices,\n" +" clippingFacesOut, pairIdx);\n" +" } else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +" }\n" +" else\n" +" { \n" +" //mark this pair as in-active\n" +" concavePairs[pairIdx].w = -1;\n" +" }\n" +" \n" +" concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp new file mode 100644 index 0000000000..a4980f71e1 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp @@ -0,0 +1,213 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada +//Host-code rewritten by Erwin Coumans + +#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" +#define KERNEL0 "SearchSortDataLowerKernel" +#define KERNEL1 "SearchSortDataUpperKernel" +#define KERNEL2 "SubtractKernel" + + +#include "b3BoundSearchCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3LauncherCL.h" +#include "kernels/BoundSearchKernelsCL.h" + +b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize) + :m_context(ctx), + m_device(device), + m_queue(queue) +{ + + const char* additionalMacros = ""; + //const char* srcFileNameForCaching=""; + + cl_int pErrNum; + const char* kernelSource = boundSearchKernelsCL; + + cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH); + b3Assert(boundSearchProg); + + m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_lowerSortDataKernel ); + + m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_upperSortDataKernel); + + m_subtractKernel = 0; + + if( maxSize ) + { + m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_subtractKernel); + } + + //m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST ); + + m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize ); + m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize ); + + m_filler = new b3FillCL(ctx,device,queue); +} + +b3BoundSearchCL::~b3BoundSearchCL() +{ + + delete m_lower; + delete m_upper; + delete m_filler; + + clReleaseKernel(m_lowerSortDataKernel); + clReleaseKernel(m_upperSortDataKernel); + clReleaseKernel(m_subtractKernel); + + +} + + +void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option ) +{ + b3Int4 constBuffer; + constBuffer.x = nSrc; + constBuffer.y = nDst; + + if( option == BOUND_LOWER ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) }; + + b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == BOUND_UPPER ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + + b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == COUNT ) + { + b3Assert( m_lower ); + b3Assert( m_upper ); + b3Assert( m_lower->capacity() <= (int)nDst ); + b3Assert( m_upper->capacity() <= (int)nDst ); + + int zero = 0; + m_filler->execute( *m_lower, zero, nDst ); + m_filler->execute( *m_upper, zero, nDst ); + + execute( src, nSrc, *m_lower, nDst, BOUND_LOWER ); + execute( src, nSrc, *m_upper, nDst, BOUND_UPPER ); + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + + b3LauncherCL launcher( m_queue, m_subtractKernel ,"m_subtractKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nDst, 64 ); + } + } + else + { + b3Assert( 0 ); + } + +} + + +void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, + b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option ) +{ + + + for(int i=0; i<nSrc-1; i++) + b3Assert( src[i].m_key <= src[i+1].m_key ); + + b3SortData minData,zeroData,maxData; + minData.m_key = -1; + minData.m_value = -1; + zeroData.m_key=0; + zeroData.m_value=0; + maxData.m_key = nDst; + maxData.m_value = nDst; + + if( option == BOUND_LOWER ) + { + for(int i=0; i<nSrc; i++) + { + b3SortData& iData = (i==0)? minData: src[i-1]; + b3SortData& jData = (i==nSrc)? maxData: src[i]; + + if( iData.m_key != jData.m_key ) + { + int k = jData.m_key; + { + dst[k] = i; + } + } + } + } + else if( option == BOUND_UPPER ) + { + for(int i=1; i<nSrc+1; i++) + { + b3SortData& iData = src[i-1]; + b3SortData& jData = (i==nSrc)? maxData: src[i]; + + if( iData.m_key != jData.m_key ) + { + int k = iData.m_key; + { + dst[k] = i; + } + } + } + } + else if( option == COUNT ) + { + b3AlignedObjectArray<unsigned int> lower; + lower.resize(nDst ); + b3AlignedObjectArray<unsigned int> upper; + upper.resize(nDst ); + + for(int i=0; i<nDst; i++) + { + lower[i] = upper[i] = 0; + } + + executeHost( src, nSrc, lower, nDst, BOUND_LOWER ); + executeHost( src, nSrc, upper, nDst, BOUND_UPPER ); + + for( int i=0; i<nDst; i++) + { + dst[i] = upper[i] - lower[i]; + } + } + else + { + b3Assert( 0 ); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h new file mode 100644 index 0000000000..7e2940965c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h @@ -0,0 +1,67 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#ifndef B3_BOUNDSEARCH_H +#define B3_BOUNDSEARCH_H + +#pragma once + +/*#include <Adl/Adl.h> +#include <AdlPrimitives/Math/Math.h> +#include <AdlPrimitives/Sort/SortData.h> +#include <AdlPrimitives/Fill/Fill.h> +*/ + +#include "b3OpenCLArray.h" +#include "b3FillCL.h" +#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?) +class b3BoundSearchCL +{ + public: + + enum Option + { + BOUND_LOWER, + BOUND_UPPER, + COUNT, + }; + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + + cl_kernel m_lowerSortDataKernel; + cl_kernel m_upperSortDataKernel; + cl_kernel m_subtractKernel; + + b3OpenCLArray<b3Int4>* m_constbtOpenCLArray; + b3OpenCLArray<unsigned int>* m_lower; + b3OpenCLArray<unsigned int>* m_upper; + + b3FillCL* m_filler; + + b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); + + virtual ~b3BoundSearchCL(); + + // src has to be src[i].m_key <= src[i+1].m_key + void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER ); + + void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); +}; + + +#endif //B3_BOUNDSEARCH_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h new file mode 100644 index 0000000000..52f219ae3f --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h @@ -0,0 +1,19 @@ + +#ifndef B3_BUFFER_INFO_CL_H +#define B3_BUFFER_INFO_CL_H + +#include "b3OpenCLArray.h" + + +struct b3BufferInfoCL +{ + //b3BufferInfoCL(){} + +// template<typename T> + b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){} + + cl_mem m_clBuffer; + bool m_isReadOnly; +}; + +#endif //B3_BUFFER_INFO_CL_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp new file mode 100644 index 0000000000..f05c2648f1 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp @@ -0,0 +1,126 @@ +#include "b3FillCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3BufferInfoCL.h" +#include "b3LauncherCL.h" + +#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" + +#include "kernels/FillKernelsCL.h" + +b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue) +:m_commandQueue(queue) +{ + const char* kernelSource = fillKernelsCL; + cl_int pErrNum; + const char* additionalMacros = ""; + + cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH); + b3Assert(fillProg); + + m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillIntKernel); + + m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillIntKernel); + + m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillFloatKernel); + + + + m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillKernelInt2); + +} + +b3FillCL::~b3FillCL() +{ + clReleaseKernel(m_fillKernelInt2); + clReleaseKernel(m_fillIntKernel); + clReleaseKernel(m_fillUnsignedIntKernel); + clReleaseKernel(m_fillFloatKernel); + +} + +void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset) +{ + b3Assert( n>0 ); + + { + b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" ); + launcher.setBuffer( src.getBufferCL()); + launcher.setConst( n ); + launcher.setConst( value ); + launcher.setConst( offset); + + launcher.launch1D( n ); + } +} + +void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset) +{ + b3Assert( n>0 ); + + + { + b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel"); + launcher.setBuffer(src.getBufferCL()); + launcher.setConst( n); + launcher.setConst( value); + launcher.setConst( offset); + launcher.launch1D( n ); + } +} + + +void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset) +{ + b3Assert( n>0 ); + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( n ); + launcher.setConst(value); + launcher.setConst(offset); + + launcher.launch1D( n ); + } +} + +void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +{ + for (int i=0;i<n;i++) + { + src[i+offset]=value; + } +} + +void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset) +{ + for (int i=0;i<n;i++) + { + src[i+offset]=value; + } +} + +void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +{ + b3Assert( n>0 ); + + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + + b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(n); + launcher.setConst(value); + launcher.setConst(offset); + + //( constBuffer ); + launcher.launch1D( n ); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h new file mode 100644 index 0000000000..1609676b9d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h @@ -0,0 +1,63 @@ +#ifndef B3_FILL_CL_H +#define B3_FILL_CL_H + +#include "b3OpenCLArray.h" +#include "Bullet3Common/b3Scalar.h" + +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" + + +class b3FillCL +{ + + cl_command_queue m_commandQueue; + + cl_kernel m_fillKernelInt2; + cl_kernel m_fillIntKernel; + cl_kernel m_fillUnsignedIntKernel; + cl_kernel m_fillFloatKernel; + + public: + + struct b3ConstData + { + union + { + b3Int4 m_data; + b3UnsignedInt4 m_UnsignedData; + }; + int m_offset; + int m_n; + int m_padding[2]; + }; + +protected: + +public: + + b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue); + + virtual ~b3FillCL(); + + void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0); + + void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0); + + void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0); + + void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0); + + void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset); + + void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset); + + // void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0); + +}; + + + + + +#endif //B3_FILL_CL_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp new file mode 100644 index 0000000000..94590d11ca --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp @@ -0,0 +1,308 @@ +#include "b3LauncherCL.h" + +bool gDebugLauncherCL = false; + +b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name) +:m_commandQueue(queue), +m_kernel(kernel), +m_idx(0), +m_enableSerialization(false), +m_name(name) +{ + if (gDebugLauncherCL) + { + static int counter = 0; + printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name); + } + + m_serializationSizeInBytes = sizeof(int); +} + +b3LauncherCL::~b3LauncherCL() + { + for (int i=0;i<m_arrays.size();i++) + { + delete (m_arrays[i]); + } + + m_arrays.clear(); + if (gDebugLauncherCL) + { + static int counter = 0; + printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name); + } + } + +void b3LauncherCL::setBuffer( cl_mem clBuffer) +{ + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = clBuffer; + + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo ( kernelArg.m_clBuffer, + param_name, + sizeInBytes, + ¶m_value, + &actualSizeInBytes); + + b3Assert( err == CL_SUCCESS ); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+= sizeof(b3KernelArgData); + m_serializationSizeInBytes+=param_value; + } + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); + b3Assert( status == CL_SUCCESS ); +} + + +void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n ) +{ + for(int i=0; i<n; i++) + { + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = buffInfo[i].m_clBuffer; + + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo ( kernelArg.m_clBuffer, + param_name, + sizeInBytes, + ¶m_value, + &actualSizeInBytes); + + b3Assert( err == CL_SUCCESS ); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+= sizeof(b3KernelArgData); + m_serializationSizeInBytes+=param_value; + } + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer); + b3Assert( status == CL_SUCCESS ); + } +} + +struct b3KernelArgDataUnaligned +{ + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; + int m_unusedPadding; + union + { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; + +}; +#include <string.h> + + + +int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx) +{ + int index=0; + + int numArguments = *(int*) &buf[index]; + index+=sizeof(int); + + for (int i=0;i<numArguments;i++) + { + b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index]; + + index+=sizeof(b3KernelArgData); + if (arg->m_isBuffer) + { + b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes); + clData->resize(arg->m_argSizeInBytes); + + clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); + + arg->m_clBuffer = clData->getBufferCL(); + + m_arrays.push_back(clData); + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer); + b3Assert( status == CL_SUCCESS ); + index+=arg->m_argSizeInBytes; + } else + { + cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); + b3Assert( status == CL_SUCCESS ); + } + b3KernelArgData b; + memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned)); + m_kernelArguments.push_back(b); + } +m_serializationSizeInBytes = index; + return index; +} + +int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx) + { + int index=0; + + int numArguments = *(int*) &goldBuffer[index]; + index+=sizeof(int); + + if (numArguments != m_kernelArguments.size()) + { + printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size()); + return -1; + } + + for (int ii=0;ii<numArguments;ii++) + { + b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index]; + + if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes) + { + printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes); + return -2; + } + + { + int expected = argGold->m_isBuffer; + int found = m_kernelArguments[ii].m_isBuffer; + + if (expected != found) + { + printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found); + return -3; + } + } + index+=sizeof(b3KernelArgData); + + if (argGold->m_isBuffer) + { + + unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes); + unsigned char* goldBuf = &goldBuffer[index]; + for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++) + { + memBuf[j] = 0xaa; + } + + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes, + memBuf, 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + clFinish(m_commandQueue); + + for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + { + int expected = goldBuf[b]; + int found = memBuf[b]; + if (expected != found) + { + printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n", + ii, b, expected, found); + return -4; + } + } + + + index+=argGold->m_argSizeInBytes; + } else + { + + //compare content + for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + { + int expected = argGold->m_argData[b]; + int found =m_kernelArguments[ii].m_argData[b]; + if (expected != found) + { + printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n", + ii, b, expected, found); + return -5; + } + } + + } + } + return index; + +} + +int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity) +{ +//initialize to known values +for (int i=0;i<destBufferCapacity;i++) + destBuffer[i] = 0xec; + + assert(destBufferCapacity>=m_serializationSizeInBytes); + + //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc + int numArguments = m_kernelArguments.size(); + int curBufferSize = 0; + int* dest = (int*)&destBuffer[curBufferSize]; + *dest = numArguments; + curBufferSize += sizeof(int); + + + + for (int i=0;i<this->m_kernelArguments.size();i++) + { + b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize]; + *arg = m_kernelArguments[i]; + curBufferSize+=sizeof(b3KernelArgData); + if (arg->m_isBuffer==1) + { + //copy the OpenCL buffer content + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes, + &destBuffer[curBufferSize], 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + clFinish(m_commandQueue); + curBufferSize+=arg->m_argSizeInBytes; + } + + } + return curBufferSize; +} + +void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems) +{ + int num = numWorkItems; + int buffSize = getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;i<buffSize+1;i++) + { + unsigned char* ptr = (unsigned char*)&buf[i]; + *ptr = 0xff; + } +// int actualWrite = serializeArguments(buf,buffSize); + +// unsigned char* cptr = (unsigned char*)&buf[buffSize]; +// printf("buf[buffSize] = %d\n",*cptr); + + assert(buf[buffSize]==0xff);//check for buffer overrun + int* ptr = (int*)&buf[buffSize]; + + *ptr = num; + + FILE* f = fopen(fileName,"wb"); + fwrite(buf,buffSize+sizeof(int),1,f); + fclose(f); + + delete[] buf; +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h new file mode 100644 index 0000000000..1b267b31ef --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h @@ -0,0 +1,135 @@ + +#ifndef B3_LAUNCHER_CL_H +#define B3_LAUNCHER_CL_H + +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3MinMax.h" +#include "b3OpenCLArray.h" +#include <stdio.h> + +#define B3_DEBUG_SERIALIZE_CL + + +#ifdef _WIN32 +#pragma warning(disable :4996) +#endif +#define B3_CL_MAX_ARG_SIZE 16 +B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData +{ + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; + int m_unusedPadding; + union + { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; + +}; + +class b3LauncherCL +{ + + cl_command_queue m_commandQueue; + cl_kernel m_kernel; + int m_idx; + + b3AlignedObjectArray<b3KernelArgData> m_kernelArguments; + int m_serializationSizeInBytes; + bool m_enableSerialization; + + const char* m_name; + public: + + b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays; + + b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name); + + virtual ~b3LauncherCL(); + + void setBuffer( cl_mem clBuffer); + + void setBuffers( b3BufferInfoCL* buffInfo, int n ); + + int getSerializationBufferSize() const + { + return m_serializationSizeInBytes; + } + + int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx); + + inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx); + + int serializeArguments(unsigned char* destBuffer, int destBufferCapacity); + + int getNumArguments() const + { + return m_kernelArguments.size(); + } + + b3KernelArgData getArgument(int index) + { + return m_kernelArguments[index]; + } + + void serializeToFile(const char* fileName, int numWorkItems); + + template<typename T> + inline void setConst( const T& consts ) + { + int sz=sizeof(T); + b3Assert(sz<=B3_CL_MAX_ARG_SIZE); + + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 0; + T* destArg = (T*)kernelArg.m_argData; + *destArg = consts; + kernelArg.m_argSizeInBytes = sizeof(T); + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+=sizeof(b3KernelArgData); + } + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts ); + b3Assert( status == CL_SUCCESS ); + } + + inline void launch1D( int numThreads, int localSize = 64) + { + launch2D( numThreads, 1, localSize, 1 ); + } + + inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY ) + { + size_t gRange[3] = {1,1,1}; + size_t lRange[3] = {1,1,1}; + lRange[0] = localSizeX; + lRange[1] = localSizeY; + gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1)); + gRange[0] *= lRange[0]; + gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1)); + gRange[1] *= lRange[1]; + + cl_int status = clEnqueueNDRangeKernel( m_commandQueue, + m_kernel, 2, NULL, gRange, lRange, 0,0,0 ); + if (status != CL_SUCCESS) + { + printf("Error: OpenCL status = %d\n",status); + } + b3Assert( status == CL_SUCCESS ); + + } + + void enableSerialization(bool serialize) + { + m_enableSerialization = serialize; + } + +}; + + + +#endif //B3_LAUNCHER_CL_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h new file mode 100644 index 0000000000..d70c30f53f --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h @@ -0,0 +1,306 @@ +#ifndef B3_OPENCL_ARRAY_H +#define B3_OPENCL_ARRAY_H + +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" + +template <typename T> +class b3OpenCLArray +{ + size_t m_size; + size_t m_capacity; + cl_mem m_clBuffer; + + cl_context m_clContext; + cl_command_queue m_commandQueue; + + bool m_ownsMemory; + + bool m_allowGrowingCapacity; + + void deallocate() + { + if (m_clBuffer && m_ownsMemory) + { + clReleaseMemObject(m_clBuffer); + } + m_clBuffer = 0; + m_capacity=0; + } + + b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src); + + B3_FORCE_INLINE size_t allocSize(size_t size) + { + return (size ? size*2 : 1); + } + +public: + + b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true) + :m_size(0), m_capacity(0),m_clBuffer(0), + m_clContext(ctx),m_commandQueue(queue), + m_ownsMemory(true),m_allowGrowingCapacity(true) + { + if (initialCapacity) + { + reserve(initialCapacity); + } + m_allowGrowingCapacity = allowGrowingCapacity; + } + + ///this is an error-prone method with no error checking, be careful! + void setFromOpenCLBuffer(cl_mem buffer, size_t sizeInElements) + { + deallocate(); + m_ownsMemory = false; + m_allowGrowingCapacity = false; + m_clBuffer = buffer; + m_size = sizeInElements; + m_capacity = sizeInElements; + } + +// we could enable this assignment, but need to make sure to avoid accidental deep copies +// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src) +// { +// copyFromArray(src); +// return *this; +// } + + + cl_mem getBufferCL() const + { + return m_clBuffer; + } + + + virtual ~b3OpenCLArray() + { + deallocate(); + m_size=0; + m_capacity=0; + } + + B3_FORCE_INLINE bool push_back(const T& _Val,bool waitForCompletion=true) + { + bool result = true; + size_t sz = size(); + if( sz == capacity() ) + { + result = reserve( allocSize(size()) ); + } + copyFromHostPointer(&_Val, 1, sz, waitForCompletion); + m_size++; + return result; + } + + B3_FORCE_INLINE T forcedAt(size_t n) const + { + b3Assert(n>=0); + b3Assert(n<capacity()); + T elem; + copyToHostPointer(&elem,1,n,true); + return elem; + } + + B3_FORCE_INLINE T at(size_t n) const + { + b3Assert(n>=0); + b3Assert(n<size()); + T elem; + copyToHostPointer(&elem,1,n,true); + return elem; + } + + B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents=true) + { + bool result = true; + size_t curSize = size(); + + if (newsize < curSize) + { + //leave the OpenCL memory for now + } else + { + if (newsize > size()) + { + result = reserve(newsize,copyOldContents); + } + + //leave new data uninitialized (init in debug mode?) + //for (size_t i=curSize;i<newsize;i++) ... + } + + if (result) + { + m_size = newsize; + } else + { + m_size = 0; + } + return result; + } + + B3_FORCE_INLINE size_t size() const + { + return m_size; + } + + B3_FORCE_INLINE size_t capacity() const + { + return m_capacity; + } + + B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents=true) + { + bool result=true; + // determine new minimum length of allocated storage + if (capacity() < _Count) + { // not enough room, reallocate + + if (m_allowGrowingCapacity) + { + cl_int ciErrNum; + //create a new OpenCL buffer + size_t memSizeInBytes = sizeof(T)*_Count; + cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum); + if (ciErrNum!=CL_SUCCESS) + { + b3Error("OpenCL out-of-memory\n"); + _Count = 0; + result = false; + } +//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS +#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS + unsigned char* src = (unsigned char*)malloc(memSizeInBytes); + for (size_t i=0;i<memSizeInBytes;i++) + src[i] = 0xbb; + ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 ); + b3Assert(ciErrNum==CL_SUCCESS); + clFinish(m_commandQueue); + free(src); +#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS + + if (result) + { + if (copyOldContents) + copyToCL(buf, size()); + } + + //deallocate the old buffer + deallocate(); + + m_clBuffer = buf; + + m_capacity = _Count; + } else + { + //fail: assert and + b3Assert(0); + deallocate(); + result=false; + } + } + return result; + } + + + void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const + { + if (numElements<=0) + return; + + b3Assert(m_clBuffer); + b3Assert(destination); + + //likely some error, destination is same as source + b3Assert(m_clBuffer != destination); + + b3Assert((firstElem+numElements)<=m_size); + + cl_int status = 0; + + + b3Assert(numElements>0); + b3Assert(numElements<=m_size); + + size_t srcOffsetBytes = sizeof(T)*firstElem; + size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems; + + status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination, + srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 ); + + b3Assert( status == CL_SUCCESS ); + } + + void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true) + { + size_t newSize = srcArray.size(); + + bool copyOldContents = false; + resize (newSize,copyOldContents); + if (newSize) + copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion); + + } + + void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true) + { + b3Assert(numElems+destFirstElem <= capacity()); + + if (numElems+destFirstElem) + { + cl_int status = 0; + size_t sizeInBytes=sizeof(T)*numElems; + status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes, + src, 0,0,0 ); + b3Assert(status == CL_SUCCESS ); + if (waitForCompletion) + clFinish(m_commandQueue); + } else + { + b3Error("copyFromHostPointer invalid range\n"); + } + } + + + void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const + { + destArray.resize(this->size()); + if (size()) + copyToHostPointer(&destArray[0], size(),0,waitForCompletion); + } + + void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const + { + b3Assert(numElem+srcFirstElem <= capacity()); + + if(numElem+srcFirstElem <= capacity()) + { + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem, + destPtr, 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + + if (waitForCompletion) + clFinish(m_commandQueue); + } else + { + b3Error("copyToHostPointer invalid range\n"); + } + } + + void copyFromOpenCLArray(const b3OpenCLArray& src) + { + size_t newSize = src.size(); + resize(newSize); + if (size()) + { + src.copyToCL(m_clBuffer,size()); + } + } + +}; + + +#endif //B3_OPENCL_ARRAY_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp new file mode 100644 index 0000000000..42cd197740 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp @@ -0,0 +1,126 @@ +#include "b3PrefixScanCL.h" +#include "b3FillCL.h" +#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" + +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/PrefixScanKernelsCL.h" + +b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsCL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH); + b3Assert(scanProg); + + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_localScanKernel ); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_blockSumKernel ); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_propagationKernel ); +} + + +b3PrefixScanCL::~b3PrefixScanCL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template<class T> +T b3NextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i<sizeof(T)*8; i++) + n = n | (n>>i); + return n+1; +} + +void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum) +{ + +// b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + b3Int4 constBuffer; + constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + + b3OpenCLArray<unsigned int>* srcNative = &src; + b3OpenCLArray<unsigned int>* dstNative = &dst; + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum) +{ + unsigned int s = 0; + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i<n; i++) + { + dst[i] = s; + s += src[i]; + } + } + /*else + { + for(int i=0; i<n; i++) + { + s += hSrc[i]; + hDst[i] = s; + } + } + */ + + if( sum ) + { + *sum = dst[n-1]; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h new file mode 100644 index 0000000000..a9a2e61b9e --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h @@ -0,0 +1,37 @@ + +#ifndef B3_PREFIX_SCAN_CL_H +#define B3_PREFIX_SCAN_CL_H + +#include "b3OpenCLArray.h" +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + +class b3PrefixScanCL +{ + enum + { + BLOCK_SIZE = 128 + }; + +// Option m_option; + + cl_command_queue m_commandQueue; + + cl_kernel m_localScanKernel; + cl_kernel m_blockSumKernel; + cl_kernel m_propagationKernel; + + b3OpenCLArray<unsigned int>* m_workBuffer; + + + public: + + b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~b3PrefixScanCL(); + + void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0); + void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0); +}; + +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp new file mode 100644 index 0000000000..80560d793d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp @@ -0,0 +1,126 @@ +#include "b3PrefixScanFloat4CL.h" +#include "b3FillCL.h" +#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl" + +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/PrefixScanKernelsFloat4CL.h" + +b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsFloat4CL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); + b3Assert(scanProg); + + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_localScanKernel ); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_blockSumKernel ); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_propagationKernel ); +} + + +b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template<class T> +T b3NextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i<sizeof(T)*8; i++) + n = n | (n>>i); + return n+1; +} + +void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum) +{ + +// b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + b3Int4 constBuffer; + constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + + b3OpenCLArray<b3Vector3>* srcNative = &src; + b3OpenCLArray<b3Vector3>* dstNative = &dst; + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum) +{ + b3Vector3 s=b3MakeVector3(0,0,0); + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i<n; i++) + { + dst[i] = s; + s += src[i]; + } + } + /*else + { + for(int i=0; i<n; i++) + { + s += hSrc[i]; + hDst[i] = s; + } + } + */ + + if( sum ) + { + *sum = dst[n-1]; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h new file mode 100644 index 0000000000..2c8003c1bb --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h @@ -0,0 +1,38 @@ + +#ifndef B3_PREFIX_SCAN_CL_H +#define B3_PREFIX_SCAN_CL_H + +#include "b3OpenCLArray.h" +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Vector3.h" + +class b3PrefixScanFloat4CL +{ + enum + { + BLOCK_SIZE = 128 + }; + +// Option m_option; + + cl_command_queue m_commandQueue; + + cl_kernel m_localScanKernel; + cl_kernel m_blockSumKernel; + cl_kernel m_propagationKernel; + + b3OpenCLArray<b3Vector3>* m_workBuffer; + + + public: + + b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~b3PrefixScanFloat4CL(); + + void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0); + void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum); +}; + +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp new file mode 100644 index 0000000000..f11ae4bcdb --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp @@ -0,0 +1,710 @@ + +#include "b3RadixSort32CL.h" +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3PrefixScanCL.h" +#include "b3FillCL.h" + +#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" + +#include "kernels/RadixSort32KernelsCL.h" + +b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) +:m_commandQueue(queue) +{ + b3OpenCLDeviceInfo info; + b3OpenCLUtils::getDeviceInfo(device,&info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + + m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue); + + + if (initialCapacity>0) + { + m_workBuffer1->resize(initialCapacity); + m_workBuffer3->resize(initialCapacity); + m_workBuffer3a->resize(initialCapacity); + m_workBuffer4->resize(initialCapacity); + m_workBuffer4a->resize(initialCapacity); + } + + m_scan = new b3PrefixScanCL(ctx,device,queue); + m_fill = new b3FillCL(ctx,device,queue); + + const char* additionalMacros = ""; + + cl_int pErrNum; + const char* kernelSource = radixSort32KernelsCL; + + cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); + b3Assert(sortProg); + + m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountSortDataKernel ); + + + + m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountKernel); + + + + if (m_deviceCPU) + { + + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } else + { + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } + + m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_prefixScanKernel); + +} + +b3RadixSort32CL::~b3RadixSort32CL() +{ + delete m_scan; + delete m_fill; + delete m_workBuffer1; + delete m_workBuffer2; + delete m_workBuffer3; + delete m_workBuffer3a; + delete m_workBuffer4; + delete m_workBuffer4a; + + clReleaseKernel(m_streamCountSortDataKernel); + clReleaseKernel(m_streamCountKernel); + clReleaseKernel(m_sortAndScatterSortDataKernel); + clReleaseKernel(m_sortAndScatterKernel); + clReleaseKernel(m_prefixScanKernel); +} + +void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */) +{ + int n = inout.size(); + const int BITS_PER_PASS = 8; + const int NUM_TABLES = (1<<BITS_PER_PASS); + + + int tables[NUM_TABLES]; + int counter[NUM_TABLES]; + + b3SortData* src = &inout[0]; + b3AlignedObjectArray<b3SortData> workbuffer; + workbuffer.resize(inout.size()); + b3SortData* dst = &workbuffer[0]; + + int count=0; + for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS) + { + for(int i=0; i<NUM_TABLES; i++) + { + tables[i] = 0; + } + + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + tables[tableIdx]++; + } +//#define TEST +#ifdef TEST + printf("histogram size=%d\n",NUM_TABLES); + for (int i=0;i<NUM_TABLES;i++) + { + if (tables[i]!=0) + { + printf("tables[%d]=%d]\n",i,tables[i]); + } + + } +#endif //TEST + // prefix scan + int sum = 0; + for(int i=0; i<NUM_TABLES; i++) + { + int iData = tables[i]; + tables[i] = sum; + sum += iData; + counter[i] = 0; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; + counter[tableIdx] ++; + } + + b3Swap( src, dst ); + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy + + } +} + +void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + b3AlignedObjectArray<b3SortData> inout; + keyValuesInOut.copyToHost(inout); + + executeHost(inout,sortBits); + + keyValuesInOut.copyFromHost(inout); +} + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) +{ + +} + +//#define DEBUG_RADIXSORT +//#define DEBUG_RADIXSORT2 + + +void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + int originalSize = keyValuesInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + +#ifdef DEBUG_RADIXSORT2 + b3AlignedObjectArray<b3SortData> test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif //DEBUG_RADIXSORT2 + + b3OpenCLArray<b3SortData>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); + m_workBuffer4->resize(workingSize); + b3SortData fillValue; + fillValue.m_key = 0xffffffff; + fillValue.m_value = 0xffffffff; + +#define USE_BTFILL +#ifdef USE_BTFILL + m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); +#else + //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) + + for (int i=originalSize; i<workingSize;i++) + { + m_workBuffer4->copyFromHostPointer(&fillValue,1,i); + } +#endif//USE_BTFILL + + src = m_workBuffer4; + } else + { + src = &keyValuesInOut; + m_workBuffer4->resize(0); + } + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<b3SortData>* dst = m_workBuffer3; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +#ifdef DEBUG_RADIXSORT + b3AlignedObjectArray<unsigned int> testHist; + srcHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU;//only use fast scan on GPU +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } + + for (int i=0;i<testHist.size();i+=NUM_WGS) + { + printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); + } + +#endif //DEBUG_RADIXSORT + +#define USE_GPU +#ifdef USE_GPU + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } +#else + { +#define NUM_TABLES 16 +//#define SEQUENTIAL +#ifdef SEQUENTIAL + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + +#else + + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int tables[NUM_TABLES]; + b3AlignedObjectArray<b3SortData> dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + + } + + + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + + + for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) + { + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx = 0;lIdx < 64;lIdx++) + { + int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + if( addr+j < n ) + { + // printf ("addr+j=%d\n", addr+j); + + int i = addr+j; + + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; + + b3SortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); + } + if (ok.m_value != srcHost[i].m_value) + { + + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); + + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx] ++; + + } + } + } + } + } + + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif//USE_GPU + + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + count++; + + + } + + + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4->size()) + { + m_workBuffer4->resize(originalSize); + keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); + } + + +#ifdef DEBUG_RADIXSORT + keyValuesInOut.copyToHost(test2); + + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif + +} + + + + + + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) +{ + int originalSize = keysInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + + b3OpenCLArray<unsigned int>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4a->copyFromOpenCLArray(keysInOut); + m_workBuffer4a->resize(workingSize); + unsigned int fillValue = 0xffffffff; + + m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + src = m_workBuffer4a; + } else + { + src = &keysInOut; + m_workBuffer4a->resize(0); + } + + + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + m_workBuffer3a->resize(workingSize); + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<unsigned int>* dst = m_workBuffer3a; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU; +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } + + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4a->size()) + { + m_workBuffer4a->resize(originalSize); + keysInOut.copyFromOpenCLArray(*m_workBuffer4a); + } + +} + + + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h new file mode 100644 index 0000000000..975bd80e53 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h @@ -0,0 +1,95 @@ + +#ifndef B3_RADIXSORT32_H +#define B3_RADIXSORT32_H + +#include "b3OpenCLArray.h" + +struct b3SortData +{ + union + { + unsigned int m_key; + unsigned int x; + }; + + union + { + unsigned int m_value; + unsigned int y; + + }; +}; +#include "b3BufferInfoCL.h" + +class b3RadixSort32CL +{ + + b3OpenCLArray<unsigned int>* m_workBuffer1; + b3OpenCLArray<unsigned int>* m_workBuffer2; + + b3OpenCLArray<b3SortData>* m_workBuffer3; + b3OpenCLArray<b3SortData>* m_workBuffer4; + + b3OpenCLArray<unsigned int>* m_workBuffer3a; + b3OpenCLArray<unsigned int>* m_workBuffer4a; + + cl_command_queue m_commandQueue; + + cl_kernel m_streamCountSortDataKernel; + cl_kernel m_streamCountKernel; + + cl_kernel m_prefixScanKernel; + cl_kernel m_sortAndScatterSortDataKernel; + cl_kernel m_sortAndScatterKernel; + + + bool m_deviceCPU; + + class b3PrefixScanCL* m_scan; + class b3FillCL* m_fill; + +public: + struct b3ConstData + { + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; + }; + enum + { + DATA_ALIGNMENT = 256, + WG_SIZE = 64, + BLOCK_SIZE = 256, + ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE), + BITS_PER_PASS = 4, + NUM_BUCKET=(1<<BITS_PER_PASS), + // if you change this, change nPerWI in kernel as well + NUM_WGS = 20*6, // cypress +// NUM_WGS = 24*6, // cayman +// NUM_WGS = 32*4, // nv + }; + + +private: + + +public: + + b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0); + + virtual ~b3RadixSort32CL(); + + void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32); + + ///keys only + void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 ); + + void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 ); + void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); + void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32); + +}; +#endif //B3_RADIXSORT32_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl new file mode 100644 index 0000000000..f3b4a1e8a7 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl @@ -0,0 +1,106 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +typedef struct +{ + u32 m_key; + u32 m_value; +}SortData; + + + +typedef struct +{ + u32 m_nSrc; + u32 m_nDst; + u32 m_padding[2]; +} ConstBuffer; + + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nSrc ) + { + SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = (gIdx==0)? first: src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { +// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++) + u32 k = jData.m_key; + { + dst[k] = gIdx; + } + } + } +} + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX+1; + + if( gIdx < nSrc+1 ) + { + SortData first; first.m_key = 0; first.m_value = 0; + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { + u32 k = iData.m_key; + { + dst[k] = gIdx; + } + } + } +} + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + + if( gIdx < nDst ) + { + C[gIdx] = A[gIdx] - B[gIdx]; + } +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h new file mode 100644 index 0000000000..9c9e847138 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h @@ -0,0 +1,87 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* boundSearchKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"typedef struct\n" +"{\n" +" u32 m_key; \n" +" u32 m_value;\n" +"}SortData;\n" +"typedef struct\n" +"{\n" +" u32 m_nSrc;\n" +" u32 m_nDst;\n" +" u32 m_padding[2];\n" +"} ConstBuffer;\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nSrc )\n" +" {\n" +" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = (gIdx==0)? first: src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" +" u32 k = jData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX+1;\n" +" if( gIdx < nSrc+1 )\n" +" {\n" +" SortData first; first.m_key = 0; first.m_value = 0;\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +" u32 k = iData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nDst )\n" +" {\n" +" C[gIdx] = A[gIdx] - B[gIdx];\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl new file mode 100644 index 0000000000..2eee5752ec --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl @@ -0,0 +1,128 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + int m_n; + int m_padding[3]; +} ConstBuffer; + + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy1F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float4 a0 = src[gIdx]; + + dst[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy2F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 2*gIdx <= cb.m_n ) + { + float4 a0 = src[gIdx*2+0]; + float4 a1 = src[gIdx*2+1]; + + dst[ gIdx*2+0 ] = a0; + dst[ gIdx*2+1 ] = a1; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy4F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 4*gIdx <= cb.m_n ) + { + int idx0 = gIdx*4+0; + int idx1 = gIdx*4+1; + int idx2 = gIdx*4+2; + int idx3 = gIdx*4+3; + + float4 a0 = src[idx0]; + float4 a1 = src[idx1]; + float4 a2 = src[idx2]; + float4 a3 = src[idx3]; + + dst[ idx0 ] = a0; + dst[ idx1 ] = a1; + dst[ idx2 ] = a2; + dst[ idx3 ] = a3; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF1Kernel(__global float* dstF1, __global float* srcF1, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float a0 = srcF1[gIdx]; + + dstF1[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float2 a0 = srcF2[gIdx]; + + dstF2[ gIdx ] = a0; + } +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h new file mode 100644 index 0000000000..e5670e3cd3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h @@ -0,0 +1,132 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* copyKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_padding[3];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx];\n" +"\n" +" dst[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 2*gIdx <= cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx*2+0];\n" +" float4 a1 = src[gIdx*2+1];\n" +"\n" +" dst[ gIdx*2+0 ] = a0;\n" +" dst[ gIdx*2+1 ] = a1;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 4*gIdx <= cb.m_n )\n" +" {\n" +" int idx0 = gIdx*4+0;\n" +" int idx1 = gIdx*4+1;\n" +" int idx2 = gIdx*4+2;\n" +" int idx3 = gIdx*4+3;\n" +"\n" +" float4 a0 = src[idx0];\n" +" float4 a1 = src[idx1];\n" +" float4 a2 = src[idx2];\n" +" float4 a3 = src[idx3];\n" +"\n" +" dst[ idx0 ] = a0;\n" +" dst[ idx1 ] = a1;\n" +" dst[ idx2 ] = a2;\n" +" dst[ idx3 ] = a3;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float a0 = srcF1[gIdx];\n" +"\n" +" dstF1[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float2 a0 = srcF2[gIdx];\n" +"\n" +" dstF2[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl new file mode 100644 index 0000000000..71c31075dd --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl @@ -0,0 +1,107 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + union + { + int4 m_data; + uint4 m_unsignedData; + float m_floatData; + }; + int m_offset; + int m_n; + int m_padding[2]; +} ConstBuffer; + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstFloat[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt4[ offset+gIdx ] = value; + } +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h new file mode 100644 index 0000000000..4f8b96e489 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h @@ -0,0 +1,91 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* fillKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"typedef struct\n" +"{\n" +" union\n" +" {\n" +" int4 m_data;\n" +" uint4 m_unsignedData;\n" +" float m_floatData;\n" +" };\n" +" int m_offset;\n" +" int m_n;\n" +" int m_padding[2];\n" +"} ConstBuffer;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstFloat[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt4[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl new file mode 100644 index 0000000000..c9da79854a --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize) +{ + float4 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + float4 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb) +{ + __local float4 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0; + + float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) + sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + float4 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global float4* dst, uint4 cb) +{ + __local float4 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0; + } + + GROUP_LDS_BARRIER; + + float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl new file mode 100644 index 0000000000..963cc1e48e --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize) +{ + u32 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + u32 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer, + uint4 cb) +{ + __local u32 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0; + + u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + u32 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global u32* dst, uint4 cb) +{ + __local u32 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0; + } + + GROUP_LDS_BARRIER; + + u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h new file mode 100644 index 0000000000..27baab8331 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" u32 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" u32 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n" +" uint4 cb)\n" +"{\n" +" __local u32 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" u32 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n" +"{\n" +" __local u32 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +" {\n" +" dst[i] = ldsData[i];\n" +" }\n" +" if( gIdx == 0 )\n" +" {\n" +" dst[cb.m_numBlocks] = sum;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h new file mode 100644 index 0000000000..5b13254796 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsFloat4CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" float4 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" float4 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n" +"{\n" +" __local float4 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) \n" +" sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" float4 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n" +"{\n" +" __local float4 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +" {\n" +" dst[i] = ldsData[i];\n" +" }\n" +" if( gIdx == 0 )\n" +" {\n" +" dst[cb.m_numBlocks] = sum;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl new file mode 100644 index 0000000000..7402e2f3b3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl @@ -0,0 +1,1071 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Author Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AtomAdd(x, value) atom_add(&(x), value) + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +#define WG_SIZE 64 +#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE) +#define BITS_PER_PASS 4 +#define NUM_BUCKET (1<<BITS_PER_PASS) +typedef uchar u8; + +// this isn't optimization for VLIW. But just reducing writes. +#define USE_2LEVEL_REDUCE 1 + +//#define CHECK_BOUNDARY 1 + +//#define NV_GPU 1 + + +// Cypress +#define nPerWI 16 +// Cayman +//#define nPerWI 20 + +#define m_n x +#define m_nWGs y +#define m_startBit z +#define m_nBlocksPerWG w + +/* +typedef struct +{ + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; +} ConstBuffer; +*/ + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + + +uint prefixScanVectorEx( uint4* data ) +{ + u32 sum = 0; + u32 tmp = data[0].x; + data[0].x = sum; + sum += tmp; + tmp = data[0].y; + data[0].y = sum; + sum += tmp; + tmp = data[0].z; + data[0].z = sum; + sum += tmp; + tmp = data[0].w; + data[0].w = sum; + sum += tmp; + return sum; +} + +u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ ) +{ + { // Set data + sorterSharedMemory[lIdx] = 0; + sorterSharedMemory[lIdx+wgSize] = pData; + } + + GROUP_LDS_BARRIER; + + { // Prefix sum + int idx = 2*lIdx + (wgSize+1); +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < 64 ) + { + u32 u0, u1, u2; + u0 = sorterSharedMemory[idx-3]; + u1 = sorterSharedMemory[idx-2]; + u2 = sorterSharedMemory[idx-1]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-12]; + u1 = sorterSharedMemory[idx-8]; + u2 = sorterSharedMemory[idx-4]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-48]; + u1 = sorterSharedMemory[idx-32]; + u2 = sorterSharedMemory[idx-16]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#else + if( lIdx < 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-4]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-16]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#endif + } + + GROUP_LDS_BARRIER; + + *totalSum = sorterSharedMemory[wgSize*2-1]; + u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; + return addValue; +} + +//__attribute__((reqd_work_group_size(128,1,1))) +uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + + +//__attribute__((reqd_work_group_size(64,1,1))) +uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + +u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} + +u32 bit8Scan(u32 v) +{ + return (v<<8) + (v<<16) + (v<<24); +} + +//=== + + + + +#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i]>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i].m_key>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +#define nPerLane (nPerWI/4) + +// NUM_BUCKET*nWGs < 128*nPerWI +__kernel +__attribute__((reqd_work_group_size(128,1,1))) +void PrefixScanKernel( __global u32* wHistogram1, int4 cb ) +{ + __local u32 ldsTopScanData[128*2]; + + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + const int nWGs = cb.m_nWGs; + + u32 data[nPerWI]; + for(int i=0; i<nPerWI; i++) + { + data[i] = 0; + if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs ) + data[i] = wHistogram1[nPerWI*lIdx+i]; + } + + uint4 myData = make_uint4(0,0,0,0); + + for(int i=0; i<nPerLane; i++) + { + myData.x += data[nPerLane*0+i]; + myData.y += data[nPerLane*1+i]; + myData.z += data[nPerLane*2+i]; + myData.w += data[nPerLane*3+i]; + } + + uint totalSum; + uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData ); + +// for(int j=0; j<4; j++) // somehow it introduces a lot of branches + { int j = 0; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 1; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 2; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 3; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + + for(int i=0; i<nPerLane; i++) + { + data[nPerLane*0+i] += scanned.x; + data[nPerLane*1+i] += scanned.y; + data[nPerLane*2+i] += scanned.z; + data[nPerLane*3+i] += scanned.w; + } + + for(int i=0; i<nPerWI; i++) + { + int index = nPerWI*lIdx+i; + if (index < NUM_BUCKET*nWGs) + wHistogram1[nPerWI*lIdx+i] = data[i]; + } +} + +// 4 scan, 4 exchange +void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++) + { + u32 mask = (1<<bitIdx); + uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); + uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); + u32 total; + prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); + { + uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); + uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); + dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); + + GROUP_LDS_BARRIER; + + ldsSortData[dstAddr.x] = sortData[0]; + ldsSortData[dstAddr.y] = sortData[1]; + ldsSortData[dstAddr.z] = sortData[2]; + ldsSortData[dstAddr.w] = sortData[3]; + + GROUP_LDS_BARRIER; + + sortData[0] = ldsSortData[localAddr.x]; + sortData[1] = ldsSortData[localAddr.y]; + sortData[2] = ldsSortData[localAddr.z]; + sortData[3] = ldsSortData[localAddr.w]; + + GROUP_LDS_BARRIER; + } + } +} + +// 2 scan, 2 exchange +void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + +#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + u32 myHistogram = 0; + + u32 sortData[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff; +#else + sortData[i] = gSrc[ addr+i ]; +#endif + + sort4Bits(sortData, startBit, lIdx, ldsSortData); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) +#endif + gDst[ groupOffset + myIdx ] = sortData[ie]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + +// 2 scan, 2 exchange +void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + ldsSortVal[newOffset[0]] = sortVal[0]; + ldsSortVal[newOffset[1]] = sortVal[1]; + ldsSortVal[newOffset[2]] = sortVal[2]; + ldsSortVal[newOffset[3]] = sortVal[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + sortVal[0] = ldsSortVal[dstAddr+0]; + sortVal[1] = ldsSortVal[dstAddr+1]; + sortVal[2] = ldsSortVal[dstAddr+2]; + sortVal[3] = ldsSortVal[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + + u32 myHistogram = 0; + + int sortData[ELEMENTS_PER_WORK_ITEM]; + int sortVal[ELEMENTS_PER_WORK_ITEM]; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + { + sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff; + sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff; + } +#else + { + sortData[i] = gSrc[ addr+i ].m_key; + sortVal[i] = gSrc[ addr+i ].m_value; + } +#endif + + sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) + { + if ((groupOffset + myIdx)<n) + { + if (sortData[ie]==sortVal[ie]) + { + + SortDataCL tmp; + tmp.m_key = sortData[ie]; + tmp.m_value = sortVal[ie]; + if (tmp.m_key == tmp.m_value) + gDst[groupOffset + myIdx ] = tmp; + } + + } + } +#else + if ((groupOffset + myIdx)<n) + { + gDst[ groupOffset + myIdx ].m_key = sortData[ie]; + gDst[ groupOffset + myIdx ].m_value = sortVal[ie]; + } +#endif + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + + + + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) + { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) + { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h new file mode 100644 index 0000000000..8876c16aa6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h @@ -0,0 +1,910 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* radixSort32KernelsCL= \ +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Author Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define WG_SIZE 64\n" +"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" +"#define BITS_PER_PASS 4\n" +"#define NUM_BUCKET (1<<BITS_PER_PASS)\n" +"typedef uchar u8;\n" +"// this isn't optimization for VLIW. But just reducing writes. \n" +"#define USE_2LEVEL_REDUCE 1\n" +"//#define CHECK_BOUNDARY 1\n" +"//#define NV_GPU 1\n" +"// Cypress\n" +"#define nPerWI 16\n" +"// Cayman\n" +"//#define nPerWI 20\n" +"#define m_n x\n" +"#define m_nWGs y\n" +"#define m_startBit z\n" +"#define m_nBlocksPerWG w\n" +"/*\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_nWGs;\n" +" int m_startBit;\n" +" int m_nBlocksPerWG;\n" +"} ConstBuffer;\n" +"*/\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"uint prefixScanVectorEx( uint4* data )\n" +"{\n" +" u32 sum = 0;\n" +" u32 tmp = data[0].x;\n" +" data[0].x = sum;\n" +" sum += tmp;\n" +" tmp = data[0].y;\n" +" data[0].y = sum;\n" +" sum += tmp;\n" +" tmp = data[0].z;\n" +" data[0].z = sum;\n" +" sum += tmp;\n" +" tmp = data[0].w;\n" +" data[0].w = sum;\n" +" sum += tmp;\n" +" return sum;\n" +"}\n" +"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n" +"{\n" +" { // Set data\n" +" sorterSharedMemory[lIdx] = 0;\n" +" sorterSharedMemory[lIdx+wgSize] = pData;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" { // Prefix sum\n" +" int idx = 2*lIdx + (wgSize+1);\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < 64 )\n" +" {\n" +" u32 u0, u1, u2;\n" +" u0 = sorterSharedMemory[idx-3];\n" +" u1 = sorterSharedMemory[idx-2];\n" +" u2 = sorterSharedMemory[idx-1];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-12];\n" +" u1 = sorterSharedMemory[idx-8];\n" +" u2 = sorterSharedMemory[idx-4];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-48];\n" +" u1 = sorterSharedMemory[idx-32];\n" +" u2 = sorterSharedMemory[idx-16];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" *totalSum = sorterSharedMemory[wgSize*2-1];\n" +" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" +" return addValue;\n" +"}\n" +"//__attribute__((reqd_work_group_size(128,1,1)))\n" +"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"//__attribute__((reqd_work_group_size(64,1,1)))\n" +"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" +"u32 bit8Scan(u32 v)\n" +"{\n" +" return (v<<8) + (v<<16) + (v<<24);\n" +"}\n" +"//===\n" +"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"#define nPerLane (nPerWI/4)\n" +"// NUM_BUCKET*nWGs < 128*nPerWI\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(128,1,1)))\n" +"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n" +"{\n" +" __local u32 ldsTopScanData[128*2];\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" const int nWGs = cb.m_nWGs;\n" +" u32 data[nPerWI];\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" data[i] = 0;\n" +" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n" +" data[i] = wHistogram1[nPerWI*lIdx+i];\n" +" }\n" +" uint4 myData = make_uint4(0,0,0,0);\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" myData.x += data[nPerLane*0+i];\n" +" myData.y += data[nPerLane*1+i];\n" +" myData.z += data[nPerLane*2+i];\n" +" myData.w += data[nPerLane*3+i];\n" +" }\n" +" uint totalSum;\n" +" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" +"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n" +" { int j = 0;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 1;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 2;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 3;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" data[nPerLane*0+i] += scanned.x;\n" +" data[nPerLane*1+i] += scanned.y;\n" +" data[nPerLane*2+i] += scanned.z;\n" +" data[nPerLane*3+i] += scanned.w;\n" +" }\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" int index = nPerWI*lIdx+i;\n" +" if (index < NUM_BUCKET*nWGs)\n" +" wHistogram1[nPerWI*lIdx+i] = data[i];\n" +" }\n" +"}\n" +"// 4 scan, 4 exchange\n" +"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n" +" {\n" +" u32 mask = (1<<bitIdx);\n" +" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" +" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" +" u32 total;\n" +" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" +" {\n" +" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" +" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" +" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" +" GROUP_LDS_BARRIER;\n" +" ldsSortData[dstAddr.x] = sortData[0];\n" +" ldsSortData[dstAddr.y] = sortData[1];\n" +" ldsSortData[dstAddr.z] = sortData[2];\n" +" ldsSortData[dstAddr.w] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" sortData[0] = ldsSortData[localAddr.x];\n" +" sortData[1] = ldsSortData[localAddr.y];\n" +" sortData[2] = ldsSortData[localAddr.z];\n" +" sortData[3] = ldsSortData[localAddr.w];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n" +"#else\n" +" sortData[i] = gSrc[ addr+i ];\n" +"#endif\n" +" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +"#endif\n" +" gDst[ groupOffset + myIdx ] = sortData[ie];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" +"{\n" +" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" ldsSortVal[newOffset[0]] = sortVal[0];\n" +" ldsSortVal[newOffset[1]] = sortVal[1];\n" +" ldsSortVal[newOffset[2]] = sortVal[2];\n" +" ldsSortVal[newOffset[3]] = sortVal[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" sortVal[0] = ldsSortVal[dstAddr+0];\n" +" sortVal[1] = ldsSortVal[dstAddr+1];\n" +" sortVal[2] = ldsSortVal[dstAddr+2];\n" +" sortVal[3] = ldsSortVal[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" int sortData[ELEMENTS_PER_WORK_ITEM];\n" +" int sortVal[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" {\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" +" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n" +" }\n" +"#else\n" +" {\n" +" sortData[i] = gSrc[ addr+i ].m_key;\n" +" sortVal[i] = gSrc[ addr+i ].m_value;\n" +" }\n" +"#endif\n" +" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +" {\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" if (sortData[ie]==sortVal[ie])\n" +" {\n" +" \n" +" SortDataCL tmp;\n" +" tmp.m_key = sortData[ie];\n" +" tmp.m_value = sortVal[ie];\n" +" if (tmp.m_key == tmp.m_value)\n" +" gDst[groupOffset + myIdx ] = tmp;\n" +" }\n" +" \n" +" }\n" +" }\n" +"#else\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n" +" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n" +" }\n" +"#endif\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp new file mode 100644 index 0000000000..161e304f09 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp @@ -0,0 +1,391 @@ + +#include "b3GpuRaycast.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h" + +#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h" + + +#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl" + + + +struct b3GpuRaycastInternalData +{ + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_q; + cl_kernel m_raytraceKernel; + cl_kernel m_raytracePairsKernel; + cl_kernel m_findRayRigidPairIndexRanges; + + b3GpuParallelLinearBvh* m_plbvh; + b3RadixSort32CL* m_radixSorter; + b3FillCL* m_fill; + + //1 element per ray + b3OpenCLArray<b3RayInfo>* m_gpuRays; + b3OpenCLArray<b3RayHit>* m_gpuHitResults; + b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay; + b3OpenCLArray<int>* m_numRayRigidPairsPerRay; + + //1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB + b3OpenCLArray<int>* m_gpuNumRayRigidPairs; + b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index + + int m_test; +}; + +b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q) +{ + m_data = new b3GpuRaycastInternalData; + m_data->m_context = ctx; + m_data->m_device = device; + m_data->m_q = q; + m_data->m_raytraceKernel = 0; + m_data->m_raytracePairsKernel = 0; + m_data->m_findRayRigidPairIndexRanges = 0; + + m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q); + m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q); + m_data->m_fill = new b3FillCL(ctx, device, q); + + m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q); + m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q); + m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q); + m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q); + m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q); + m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q); + + { + cl_int errNum=0; + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,rayCastKernelCL,&errNum,"",B3_RAYCAST_PATH); + b3Assert(errNum==CL_SUCCESS); + m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + clReleaseProgram(prog); + } + + +} + +b3GpuRaycast::~b3GpuRaycast() +{ + clReleaseKernel(m_data->m_raytraceKernel); + clReleaseKernel(m_data->m_raytracePairsKernel); + clReleaseKernel(m_data->m_findRayRigidPairIndexRanges); + + delete m_data->m_plbvh; + delete m_data->m_radixSorter; + delete m_data->m_fill; + + delete m_data->m_gpuRays; + delete m_data->m_gpuHitResults; + delete m_data->m_firstRayRigidPairIndexPerRay; + delete m_data->m_numRayRigidPairsPerRay; + delete m_data->m_gpuNumRayRigidPairs; + delete m_data->m_gpuRayRigidPairs; + + delete m_data; +} + +bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction) +{ + b3Vector3 rs = rayFrom - spherePos; + b3Vector3 rayDir = rayTo-rayFrom; + + float A = b3Dot(rayDir,rayDir); + float B = b3Dot(rs, rayDir); + float C = b3Dot(rs, rs) - (radius * radius); + + float D = B * B - A*C; + + if (D > 0.0) + { + float t = (-B - sqrt(D))/A; + + if ( (t >= 0.0f) && (t < hitFraction) ) + { + hitFraction = t; + return true; + } + } + return false; +} + +bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly, + const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal) +{ + float exitFraction = hitFraction; + float enterFraction = -0.1f; + b3Vector3 curHitNormal=b3MakeVector3(0,0,0); + for (int i=0;i<poly.m_numFaces;i++) + { + const b3GpuFace& face = faces[poly.m_faceOffset+i]; + float fromPlaneDist = b3Dot(rayFromLocal,face.m_plane)+face.m_plane.w; + float toPlaneDist = b3Dot(rayToLocal,face.m_plane)+face.m_plane.w; + if (fromPlaneDist<0.f) + { + if (toPlaneDist >= 0.f) + { + float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); + if (exitFraction>fraction) + { + exitFraction = fraction; + } + } + } else + { + if (toPlaneDist<0.f) + { + float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); + if (enterFraction <= fraction) + { + enterFraction = fraction; + curHitNormal = face.m_plane; + curHitNormal.w = 0.f; + } + } else + { + return false; + } + } + if (exitFraction <= enterFraction) + return false; + } + + if (enterFraction < 0.f) + return false; + + hitFraction = enterFraction; + hitNormal = curHitNormal; + return true; +} + +void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies,const struct b3RigidBodyData* bodies, int numCollidables,const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData) +{ + +// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables); + + B3_PROFILE("castRaysHost"); + for (int r=0;r<rays.size();r++) + { + b3Vector3 rayFrom = rays[r].m_from; + b3Vector3 rayTo = rays[r].m_to; + float hitFraction = hitResults[r].m_hitFraction; + + int hitBodyIndex= -1; + b3Vector3 hitNormal; + + for (int b=0;b<numBodies;b++) + { + + const b3Vector3& pos = bodies[b].m_pos; + //const b3Quaternion& orn = bodies[b].m_quat; + + switch (collidables[bodies[b].m_collidableIdx].m_shapeType) + { + case SHAPE_SPHERE: + { + b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius; + if (sphere_intersect(pos, radius, rayFrom, rayTo,hitFraction)) + { + hitBodyIndex = b; + b3Vector3 hitPoint; + hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction); + hitNormal = (hitPoint-bodies[b].m_pos).normalize(); + } + } + case SHAPE_CONVEX_HULL: + { + + b3Transform convexWorldTransform; + convexWorldTransform.setIdentity(); + convexWorldTransform.setOrigin(bodies[b].m_pos); + convexWorldTransform.setRotation(bodies[b].m_quat); + b3Transform convexWorld2Local = convexWorldTransform.inverse(); + + b3Vector3 rayFromLocal = convexWorld2Local(rayFrom); + b3Vector3 rayToLocal = convexWorld2Local(rayTo); + + + int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex; + const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex]; + if (rayConvex(rayFromLocal, rayToLocal,poly,narrowphaseData->m_convexFaces, hitFraction, hitNormal)) + { + hitBodyIndex = b; + } + + + break; + } + default: + { + static bool once=true; + if (once) + { + once=false; + b3Warning("Raytest: unsupported shape type\n"); + } + } + } + } + if (hitBodyIndex>=0) + { + + hitResults[r].m_hitFraction = hitFraction; + hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction); + hitResults[r].m_hitNormal = hitNormal; + hitResults[r].m_hitBody = hitBodyIndex; + } + + } +} +///todo: add some acceleration structure (AABBs, tree etc) +void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase) +{ + //castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData); + + B3_PROFILE("castRaysGPU"); + + { + B3_PROFILE("raycast copyFromHost"); + m_data->m_gpuRays->copyFromHost(rays); + m_data->m_gpuHitResults->copyFromHost(hitResults); + + } + + int numRays = hitResults.size(); + { + m_data->m_firstRayRigidPairIndexPerRay->resize(numRays); + m_data->m_numRayRigidPairsPerRay->resize(numRays); + + m_data->m_gpuNumRayRigidPairs->resize(1); + m_data->m_gpuRayRigidPairs->resize(numRays * 16); + } + + //run kernel + const bool USE_BRUTE_FORCE_RAYCAST = false; + if(USE_BRUTE_FORCE_RAYCAST) + { + B3_PROFILE("raycast launch1D"); + + b3LauncherCL launcher(m_data->m_q,m_data->m_raytraceKernel,"m_raytraceKernel"); + int numRays = rays.size(); + launcher.setConst(numRays); + + launcher.setBuffer(m_data->m_gpuRays->getBufferCL()); + launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL()); + + launcher.setConst(numBodies); + launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL()); + launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL()); + launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL()); + launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()); + + launcher.launch1D(numRays); + clFinish(m_data->m_q); + } + else + { + m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() ); + + m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs); + + int numRayRigidPairs = -1; + m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1); + if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() ) + { + numRayRigidPairs = m_data->m_gpuRayRigidPairs->size(); + m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1); + } + + m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct + + //Sort ray-rigid pairs by ray index + { + B3_PROFILE("sort ray-rigid pairs"); + m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) ); + } + + //detect start,count of each ray pair + { + B3_PROFILE("detect ray-rigid pair index ranges"); + + { + B3_PROFILE("reset ray-rigid pair index ranges"); + + m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index + m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays); + clFinish(m_data->m_q); + } + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ), + + b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), + b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ) + }; + + b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numRayRigidPairs); + + launcher.launch1D(numRayRigidPairs); + clFinish(m_data->m_q); + } + + { + B3_PROFILE("ray-rigid intersection"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ), + b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ), + b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), + b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ), + + b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ), + + b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ) + }; + + b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_data->m_q); + } + } + + + + //copy results + { + B3_PROFILE("raycast copyToHost"); + m_data->m_gpuHitResults->copyToHost(hitResults); + } + +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h new file mode 100644 index 0000000000..3a5cf44b79 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h @@ -0,0 +1,32 @@ +#ifndef B3_GPU_RAYCAST_H +#define B3_GPU_RAYCAST_H + +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" + +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" + + + +class b3GpuRaycast +{ +protected: + struct b3GpuRaycastInternalData* m_data; +public: + b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q); + virtual ~b3GpuRaycast(); + + void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData); + + void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase); + + + +}; + +#endif //B3_GPU_RAYCAST_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl new file mode 100644 index 0000000000..e72d96876b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl @@ -0,0 +1,439 @@ + +#define SHAPE_CONVEX_HULL 3 +#define SHAPE_PLANE 4 +#define SHAPE_CONCAVE_TRIMESH 5 +#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6 +#define SHAPE_SPHERE 7 + + +typedef struct +{ + float4 m_from; + float4 m_to; +} b3RayInfo; + +typedef struct +{ + float m_hitFraction; + int m_hitResult0; + int m_hitResult1; + int m_hitResult2; + float4 m_hitPoint; + float4 m_hitNormal; +} b3RayHit; + +typedef struct +{ + float4 m_pos; + float4 m_quat; + float4 m_linVel; + float4 m_angVel; + + unsigned int m_collidableIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct Collidable +{ + union { + int m_numChildShapes; + int m_bvhIndex; + }; + float m_radius; + int m_shapeType; + int m_shapeIndex; +} Collidable; + + +typedef struct +{ + float4 m_localCenter; + float4 m_extents; + float4 mC; + float4 mE; + + float m_radius; + int m_faceOffset; + int m_numFaces; + int m_numVertices; + + int m_vertexOffset; + int m_uniqueEdgesOffset; + int m_numUniqueEdges; + int m_unused; + +} ConvexPolyhedronCL; + +typedef struct +{ + float4 m_plane; + int m_indexOffset; + int m_numIndices; +} b3GpuFace; + + + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline + Quaternion qtMul(Quaternion a, Quaternion b); + +__inline + Quaternion qtNormalize(Quaternion in); + + +__inline + Quaternion qtInvert(Quaternion q); + + +__inline + float dot3F4(float4 a, float4 b) +{ + float4 a1 = (float4)(a.xyz,0.f); + float4 b1 = (float4)(b.xyz,0.f); + return dot(a1, b1); +} + + +__inline + Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross( a, b ); + ans += a.w*b+b.w*a; + // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline + Quaternion qtNormalize(Quaternion in) +{ + return fast_normalize(in); + // in /= length( in ); + // return in; +} +__inline + float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(q,vcpy); + out = qtMul(out,qInv); + return out; +} + +__inline + Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline + float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + +void trInverse(float4 translationIn, Quaternion orientationIn, + float4* translationOut, Quaternion* orientationOut) +{ + *orientationOut = qtInvert(orientationIn); + *translationOut = qtRotate(*orientationOut, -translationIn); +} + + + + + +bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset, + __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal) +{ + rayFromLocal.w = 0.f; + rayToLocal.w = 0.f; + bool result = true; + + float exitFraction = hitFraction[0]; + float enterFraction = -0.3f; + float4 curHitNormal = (float4)(0,0,0,0); + for (int i=0;i<numFaces && result;i++) + { + b3GpuFace face = faces[faceOffset+i]; + float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w; + float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w; + if (fromPlaneDist<0.f) + { + if (toPlaneDist >= 0.f) + { + float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); + if (exitFraction>fraction) + { + exitFraction = fraction; + } + } + } else + { + if (toPlaneDist<0.f) + { + float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); + if (enterFraction <= fraction) + { + enterFraction = fraction; + curHitNormal = face.m_plane; + curHitNormal.w = 0.f; + } + } else + { + result = false; + } + } + if (exitFraction <= enterFraction) + result = false; + } + + if (enterFraction < 0.f) + { + result = false; + } + + if (result) + { + hitFraction[0] = enterFraction; + hitNormal[0] = curHitNormal; + } + return result; +} + + + + + + +bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction) +{ + float4 rs = rayFrom - spherePos; + rs.w = 0.f; + float4 rayDir = rayTo-rayFrom; + rayDir.w = 0.f; + float A = dot(rayDir,rayDir); + float B = dot(rs, rayDir); + float C = dot(rs, rs) - (radius * radius); + + float D = B * B - A*C; + + if (D > 0.0f) + { + float t = (-B - sqrt(D))/A; + + if ( (t >= 0.0f) && (t < (*hitFraction)) ) + { + *hitFraction = t; + return true; + } + } + return false; +} + +float4 setInterpolate3(float4 from, float4 to, float t) +{ + float s = 1.0f - t; + float4 result; + result = s * from + t * to; + result.w = 0.f; + return result; +} + +__kernel void rayCastKernel( + int numRays, + const __global b3RayInfo* rays, + __global b3RayHit* hitResults, + const int numBodies, + __global Body* bodies, + __global Collidable* collidables, + __global const b3GpuFace* faces, + __global const ConvexPolyhedronCL* convexShapes ) +{ + + int i = get_global_id(0); + if (i>=numRays) + return; + + hitResults[i].m_hitFraction = 1.f; + + float4 rayFrom = rays[i].m_from; + float4 rayTo = rays[i].m_to; + float hitFraction = 1.f; + float4 hitPoint; + float4 hitNormal; + int hitBodyIndex= -1; + + int cachedCollidableIndex = -1; + Collidable cachedCollidable; + + for (int b=0;b<numBodies;b++) + { + if (hitResults[i].m_hitResult2==b) + continue; + Body body = bodies[b]; + float4 pos = body.m_pos; + float4 orn = body.m_quat; + if (cachedCollidableIndex != body.m_collidableIdx) + { + cachedCollidableIndex = body.m_collidableIdx; + cachedCollidable = collidables[cachedCollidableIndex]; + } + if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL) + { + + float4 invPos = (float4)(0,0,0,0); + float4 invOrn = (float4)(0,0,0,0); + float4 rayFromLocal = (float4)(0,0,0,0); + float4 rayToLocal = (float4)(0,0,0,0); + invOrn = qtInvert(orn); + invPos = qtRotate(invOrn, -pos); + rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos; + rayToLocal = qtRotate( invOrn, rayTo) + invPos; + rayFromLocal.w = 0.f; + rayToLocal.w = 0.f; + int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces; + int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset; + if (numFaces) + { + if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal)) + { + hitBodyIndex = b; + + } + } + } + if (cachedCollidable.m_shapeType == SHAPE_SPHERE) + { + float radius = cachedCollidable.m_radius; + + if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction)) + { + hitBodyIndex = b; + hitNormal = (float4) (hitPoint-bodies[b].m_pos); + } + } + } + + if (hitBodyIndex>=0) + { + hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction); + hitResults[i].m_hitFraction = hitFraction; + hitResults[i].m_hitPoint = hitPoint; + hitResults[i].m_hitNormal = normalize(hitNormal); + hitResults[i].m_hitResult0 = hitBodyIndex; + } + +} + + +__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, + __global int* out_firstRayRigidPairIndexPerRay, + __global int* out_numRayRigidPairsPerRay, + int numRayRigidPairs) +{ + int rayRigidPairIndex = get_global_id(0); + if (rayRigidPairIndex >= numRayRigidPairs) return; + + int rayIndex = rayRigidPairs[rayRigidPairIndex].x; + + atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex); + atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]); +} + +__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, + __global b3RayHit* hitResults, + __global int* firstRayRigidPairIndexPerRay, + __global int* numRayRigidPairsPerRay, + + __global Body* bodies, + __global Collidable* collidables, + __global const b3GpuFace* faces, + __global const ConvexPolyhedronCL* convexShapes, + + __global int2* rayRigidPairs, + int numRays) +{ + int i = get_global_id(0); + if (i >= numRays) return; + + float4 rayFrom = rays[i].m_from; + float4 rayTo = rays[i].m_to; + + hitResults[i].m_hitFraction = 1.f; + + float hitFraction = 1.f; + float4 hitPoint; + float4 hitNormal; + int hitBodyIndex = -1; + + // + for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair) + { + int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i]; + int b = rayRigidPairs[rayRigidPairIndex].y; + + if (hitResults[i].m_hitResult2 == b) continue; + + Body body = bodies[b]; + Collidable rigidCollidable = collidables[body.m_collidableIdx]; + + float4 pos = body.m_pos; + float4 orn = body.m_quat; + + if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL) + { + float4 invPos = (float4)(0,0,0,0); + float4 invOrn = (float4)(0,0,0,0); + float4 rayFromLocal = (float4)(0,0,0,0); + float4 rayToLocal = (float4)(0,0,0,0); + invOrn = qtInvert(orn); + invPos = qtRotate(invOrn, -pos); + rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos; + rayToLocal = qtRotate( invOrn, rayTo) + invPos; + rayFromLocal.w = 0.f; + rayToLocal.w = 0.f; + int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces; + int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset; + + if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal)) + { + hitBodyIndex = b; + hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction); + } + } + + if (rigidCollidable.m_shapeType == SHAPE_SPHERE) + { + float radius = rigidCollidable.m_radius; + + if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction)) + { + hitBodyIndex = b; + hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction); + hitNormal = (float4) (hitPoint - bodies[b].m_pos); + } + } + } + + if (hitBodyIndex >= 0) + { + hitResults[i].m_hitFraction = hitFraction; + hitResults[i].m_hitPoint = hitPoint; + hitResults[i].m_hitNormal = normalize(hitNormal); + hitResults[i].m_hitResult0 = hitBodyIndex; + } + +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h new file mode 100644 index 0000000000..6257909a4d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h @@ -0,0 +1,381 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* rayCastKernelCL= \ +"#define SHAPE_CONVEX_HULL 3\n" +"#define SHAPE_PLANE 4\n" +"#define SHAPE_CONCAVE_TRIMESH 5\n" +"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" +"#define SHAPE_SPHERE 7\n" +"typedef struct\n" +"{\n" +" float4 m_from;\n" +" float4 m_to;\n" +"} b3RayInfo;\n" +"typedef struct\n" +"{\n" +" float m_hitFraction;\n" +" int m_hitResult0;\n" +" int m_hitResult1;\n" +" int m_hitResult2;\n" +" float4 m_hitPoint;\n" +" float4 m_hitNormal;\n" +"} b3RayHit;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" float4 m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" unsigned int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct Collidable\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" float m_radius;\n" +" int m_shapeType;\n" +" int m_shapeIndex;\n" +"} Collidable;\n" +"typedef struct \n" +"{\n" +" float4 m_localCenter;\n" +" float4 m_extents;\n" +" float4 mC;\n" +" float4 mE;\n" +" float m_radius;\n" +" int m_faceOffset;\n" +" int m_numFaces;\n" +" int m_numVertices;\n" +" int m_vertexOffset;\n" +" int m_uniqueEdgesOffset;\n" +" int m_numUniqueEdges;\n" +" int m_unused;\n" +"} ConvexPolyhedronCL;\n" +"typedef struct\n" +"{\n" +" float4 m_plane;\n" +" int m_indexOffset;\n" +" int m_numIndices;\n" +"} b3GpuFace;\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +" Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +" Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +" Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +" float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = (float4)(a.xyz,0.f);\n" +" float4 b1 = (float4)(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +" Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross( a, b );\n" +" ans += a.w*b+b.w*a;\n" +" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +" Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fast_normalize(in);\n" +" // in /= length( in );\n" +" // return in;\n" +"}\n" +"__inline\n" +" float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(q,vcpy);\n" +" out = qtMul(out,qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +" Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +" float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"void trInverse(float4 translationIn, Quaternion orientationIn,\n" +" float4* translationOut, Quaternion* orientationOut)\n" +"{\n" +" *orientationOut = qtInvert(orientationIn);\n" +" *translationOut = qtRotate(*orientationOut, -translationIn);\n" +"}\n" +"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n" +" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n" +"{\n" +" rayFromLocal.w = 0.f;\n" +" rayToLocal.w = 0.f;\n" +" bool result = true;\n" +" float exitFraction = hitFraction[0];\n" +" float enterFraction = -0.3f;\n" +" float4 curHitNormal = (float4)(0,0,0,0);\n" +" for (int i=0;i<numFaces && result;i++)\n" +" {\n" +" b3GpuFace face = faces[faceOffset+i];\n" +" float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n" +" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n" +" if (fromPlaneDist<0.f)\n" +" {\n" +" if (toPlaneDist >= 0.f)\n" +" {\n" +" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" +" if (exitFraction>fraction)\n" +" {\n" +" exitFraction = fraction;\n" +" }\n" +" } \n" +" } else\n" +" {\n" +" if (toPlaneDist<0.f)\n" +" {\n" +" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" +" if (enterFraction <= fraction)\n" +" {\n" +" enterFraction = fraction;\n" +" curHitNormal = face.m_plane;\n" +" curHitNormal.w = 0.f;\n" +" }\n" +" } else\n" +" {\n" +" result = false;\n" +" }\n" +" }\n" +" if (exitFraction <= enterFraction)\n" +" result = false;\n" +" }\n" +" if (enterFraction < 0.f)\n" +" {\n" +" result = false;\n" +" }\n" +" if (result)\n" +" { \n" +" hitFraction[0] = enterFraction;\n" +" hitNormal[0] = curHitNormal;\n" +" }\n" +" return result;\n" +"}\n" +"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n" +"{\n" +" float4 rs = rayFrom - spherePos;\n" +" rs.w = 0.f;\n" +" float4 rayDir = rayTo-rayFrom;\n" +" rayDir.w = 0.f;\n" +" float A = dot(rayDir,rayDir);\n" +" float B = dot(rs, rayDir);\n" +" float C = dot(rs, rs) - (radius * radius);\n" +" float D = B * B - A*C;\n" +" if (D > 0.0f)\n" +" {\n" +" float t = (-B - sqrt(D))/A;\n" +" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n" +" {\n" +" *hitFraction = t;\n" +" return true;\n" +" }\n" +" }\n" +" return false;\n" +"}\n" +"float4 setInterpolate3(float4 from, float4 to, float t)\n" +"{\n" +" float s = 1.0f - t;\n" +" float4 result;\n" +" result = s * from + t * to;\n" +" result.w = 0.f; \n" +" return result; \n" +"}\n" +"__kernel void rayCastKernel( \n" +" int numRays, \n" +" const __global b3RayInfo* rays, \n" +" __global b3RayHit* hitResults, \n" +" const int numBodies, \n" +" __global Body* bodies,\n" +" __global Collidable* collidables,\n" +" __global const b3GpuFace* faces,\n" +" __global const ConvexPolyhedronCL* convexShapes )\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numRays)\n" +" return;\n" +" hitResults[i].m_hitFraction = 1.f;\n" +" float4 rayFrom = rays[i].m_from;\n" +" float4 rayTo = rays[i].m_to;\n" +" float hitFraction = 1.f;\n" +" float4 hitPoint;\n" +" float4 hitNormal;\n" +" int hitBodyIndex= -1;\n" +" int cachedCollidableIndex = -1;\n" +" Collidable cachedCollidable;\n" +" for (int b=0;b<numBodies;b++)\n" +" {\n" +" if (hitResults[i].m_hitResult2==b)\n" +" continue;\n" +" Body body = bodies[b];\n" +" float4 pos = body.m_pos;\n" +" float4 orn = body.m_quat;\n" +" if (cachedCollidableIndex != body.m_collidableIdx)\n" +" {\n" +" cachedCollidableIndex = body.m_collidableIdx;\n" +" cachedCollidable = collidables[cachedCollidableIndex];\n" +" }\n" +" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" +" {\n" +" float4 invPos = (float4)(0,0,0,0);\n" +" float4 invOrn = (float4)(0,0,0,0);\n" +" float4 rayFromLocal = (float4)(0,0,0,0);\n" +" float4 rayToLocal = (float4)(0,0,0,0);\n" +" invOrn = qtInvert(orn);\n" +" invPos = qtRotate(invOrn, -pos);\n" +" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" +" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" +" rayFromLocal.w = 0.f;\n" +" rayToLocal.w = 0.f;\n" +" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n" +" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n" +" if (numFaces)\n" +" {\n" +" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" +" {\n" +" hitBodyIndex = b;\n" +" \n" +" }\n" +" }\n" +" }\n" +" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" float radius = cachedCollidable.m_radius;\n" +" \n" +" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" +" {\n" +" hitBodyIndex = b;\n" +" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n" +" }\n" +" }\n" +" }\n" +" if (hitBodyIndex>=0)\n" +" {\n" +" hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n" +" hitResults[i].m_hitFraction = hitFraction;\n" +" hitResults[i].m_hitPoint = hitPoint;\n" +" hitResults[i].m_hitNormal = normalize(hitNormal);\n" +" hitResults[i].m_hitResult0 = hitBodyIndex;\n" +" }\n" +"}\n" +"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n" +" __global int* out_firstRayRigidPairIndexPerRay,\n" +" __global int* out_numRayRigidPairsPerRay,\n" +" int numRayRigidPairs)\n" +"{\n" +" int rayRigidPairIndex = get_global_id(0);\n" +" if (rayRigidPairIndex >= numRayRigidPairs) return;\n" +" \n" +" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n" +" \n" +" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n" +" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n" +"}\n" +"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n" +" __global b3RayHit* hitResults, \n" +" __global int* firstRayRigidPairIndexPerRay,\n" +" __global int* numRayRigidPairsPerRay,\n" +" \n" +" __global Body* bodies,\n" +" __global Collidable* collidables,\n" +" __global const b3GpuFace* faces,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" \n" +" __global int2* rayRigidPairs,\n" +" int numRays)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i >= numRays) return;\n" +" \n" +" float4 rayFrom = rays[i].m_from;\n" +" float4 rayTo = rays[i].m_to;\n" +" \n" +" hitResults[i].m_hitFraction = 1.f;\n" +" \n" +" float hitFraction = 1.f;\n" +" float4 hitPoint;\n" +" float4 hitNormal;\n" +" int hitBodyIndex = -1;\n" +" \n" +" //\n" +" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n" +" {\n" +" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n" +" int b = rayRigidPairs[rayRigidPairIndex].y;\n" +" \n" +" if (hitResults[i].m_hitResult2 == b) continue;\n" +" \n" +" Body body = bodies[b];\n" +" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n" +" \n" +" float4 pos = body.m_pos;\n" +" float4 orn = body.m_quat;\n" +" \n" +" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" +" {\n" +" float4 invPos = (float4)(0,0,0,0);\n" +" float4 invOrn = (float4)(0,0,0,0);\n" +" float4 rayFromLocal = (float4)(0,0,0,0);\n" +" float4 rayToLocal = (float4)(0,0,0,0);\n" +" invOrn = qtInvert(orn);\n" +" invPos = qtRotate(invOrn, -pos);\n" +" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" +" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" +" rayFromLocal.w = 0.f;\n" +" rayToLocal.w = 0.f;\n" +" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n" +" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n" +" \n" +" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" +" {\n" +" hitBodyIndex = b;\n" +" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" +" }\n" +" }\n" +" \n" +" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" float radius = rigidCollidable.m_radius;\n" +" \n" +" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" +" {\n" +" hitBodyIndex = b;\n" +" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" +" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n" +" }\n" +" }\n" +" }\n" +" \n" +" if (hitBodyIndex >= 0)\n" +" {\n" +" hitResults[i].m_hitFraction = hitFraction;\n" +" hitResults[i].m_hitPoint = hitPoint;\n" +" hitResults[i].m_hitNormal = normalize(hitNormal);\n" +" hitResults[i].m_hitResult0 = hitBodyIndex;\n" +" }\n" +" \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h new file mode 100644 index 0000000000..c7478f54a1 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h @@ -0,0 +1,18 @@ + +#ifndef B3_CONSTRAINT4_h +#define B3_CONSTRAINT4_h +#include "Bullet3Common/b3Vector3.h" + +#include "Bullet3Dynamics/shared/b3ContactConstraint4.h" + + +B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4 +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); + + inline void setFrictionCoeff(float value) { m_linear[3] = value; } + inline float getFrictionCoeff() const { return m_linear[3]; } +}; + +#endif //B3_CONSTRAINT4_h + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp new file mode 100644 index 0000000000..af687b54e9 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp @@ -0,0 +1,137 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "b3GpuGenericConstraint.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + +#include <new> +#include "Bullet3Common/b3Transform.h" + +void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* bodies) +{ + switch (m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + *info = 3; + break; + }; + default: + { + b3Assert(0); + } + }; +} + +void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) +{ + b3Transform trA; + trA.setIdentity(); + trA.setOrigin(bodies[constraint->m_rbA].m_pos); + trA.setRotation(bodies[constraint->m_rbA].m_quat); + + b3Transform trB; + trB.setIdentity(); + trB.setOrigin(bodies[constraint->m_rbB].m_pos); + trB.setRotation(bodies[constraint->m_rbB].m_quat); + + // anchor points in global coordinates with respect to body PORs. + + // set jacobian + info->m_J1linearAxis[0] = 1; + info->m_J1linearAxis[info->rowskip+1] = 1; + info->m_J1linearAxis[2*info->rowskip+2] = 1; + + b3Vector3 a1 = trA.getBasis()*constraint->getPivotInA(); + //b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA()); + + { + b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis); + b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip); + b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip); + b3Vector3 a1neg = -a1; + a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2); + } + + if (info->m_J2linearAxis) + { + info->m_J2linearAxis[0] = -1; + info->m_J2linearAxis[info->rowskip+1] = -1; + info->m_J2linearAxis[2*info->rowskip+2] = -1; + } + + b3Vector3 a2 = trB.getBasis()*constraint->getPivotInB(); + + { + // b3Vector3 a2n = -a2; + b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis); + b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip); + b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip); + a2.getSkewSymmetricMatrix(angular0,angular1,angular2); + } + + + + // set right hand side +// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp; + b3Scalar currERP = info->erp; + + b3Scalar k = info->fps * currERP; + int j; + for (j=0; j<3; j++) + { + info->m_constraintError[j*info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]); + //printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]); + } +#if 0 + if(m_flags & B3_P2P_FLAGS_CFM) + { + for (j=0; j<3; j++) + { + info->cfm[j*info->rowskip] = m_cfm; + } + } +#endif + +#if 0 + b3Scalar impulseClamp = m_setting.m_impulseClamp;// + for (j=0; j<3; j++) + { + if (m_setting.m_impulseClamp > 0) + { + info->m_lowerLimit[j*info->rowskip] = -impulseClamp; + info->m_upperLimit[j*info->rowskip] = impulseClamp; + } + } + info->m_damping = m_setting.m_damping; +#endif + +} + +void b3GpuGenericConstraint::getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) +{ + switch (m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + getInfo2Point2Point(this,info,bodies); + break; + }; + default: + { + b3Assert(0); + } + }; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h new file mode 100644 index 0000000000..14b3ba7fec --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h @@ -0,0 +1,132 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#ifndef B3_GPU_GENERIC_CONSTRAINT_H +#define B3_GPU_GENERIC_CONSTRAINT_H + +#include "Bullet3Common/b3Quaternion.h" +struct b3RigidBodyData; +enum B3_CONSTRAINT_FLAGS +{ + B3_CONSTRAINT_FLAG_ENABLED=1, +}; + +enum b3GpuGenericConstraintType +{ + B3_GPU_POINT2POINT_CONSTRAINT_TYPE=3, + B3_GPU_FIXED_CONSTRAINT_TYPE=4, +// B3_HINGE_CONSTRAINT_TYPE, +// B3_CONETWIST_CONSTRAINT_TYPE, +// B3_D6_CONSTRAINT_TYPE, +// B3_SLIDER_CONSTRAINT_TYPE, +// B3_CONTACT_CONSTRAINT_TYPE, +// B3_D6_SPRING_CONSTRAINT_TYPE, +// B3_GEAR_CONSTRAINT_TYPE, + + B3_GPU_MAX_CONSTRAINT_TYPE +}; + + + +struct b3GpuConstraintInfo2 +{ + // integrator parameters: frames per second (1/stepsize), default error + // reduction parameter (0..1). + b3Scalar fps,erp; + + // for the first and second body, pointers to two (linear and angular) + // n*3 jacobian sub matrices, stored by rows. these matrices will have + // been initialized to 0 on entry. if the second body is zero then the + // J2xx pointers may be 0. + b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis; + + // elements to jump from one row to the next in J's + int rowskip; + + // right hand sides of the equation J*v = c + cfm * lambda. cfm is the + // "constraint force mixing" vector. c is set to zero on entry, cfm is + // set to a constant value (typically very small or zero) value on entry. + b3Scalar *m_constraintError,*cfm; + + // lo and hi limits for variables (set to -/+ infinity on entry). + b3Scalar *m_lowerLimit,*m_upperLimit; + + // findex vector for variables. see the LCP solver interface for a + // description of what this does. this is set to -1 on entry. + // note that the returned indexes are relative to the first index of + // the constraint. + int *findex; + // number of solver iterations + int m_numIterations; + + //damping of the velocity + b3Scalar m_damping; +}; + + +B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint +{ + int m_constraintType; + int m_rbA; + int m_rbB; + float m_breakingImpulseThreshold; + + b3Vector3 m_pivotInA; + b3Vector3 m_pivotInB; + b3Quaternion m_relTargetAB; + + int m_flags; + int m_uid; + int m_padding[2]; + + int getRigidBodyA() const + { + return m_rbA; + } + int getRigidBodyB() const + { + return m_rbB; + } + + const b3Vector3& getPivotInA() const + { + return m_pivotInA; + } + + const b3Vector3& getPivotInB() const + { + return m_pivotInB; + } + + int isEnabled() const + { + return m_flags & B3_CONSTRAINT_FLAG_ENABLED; + } + + float getBreakingImpulseThreshold() const + { + return m_breakingImpulseThreshold; + } + + ///internal method used by the constraint solver, don't use them directly + void getInfo1 (unsigned int* info,const b3RigidBodyData* bodies); + + ///internal method used by the constraint solver, don't use them directly + void getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies); + + +}; + +#endif //B3_GPU_GENERIC_CONSTRAINT_H
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp new file mode 100644 index 0000000000..179dfc4f26 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp @@ -0,0 +1,1382 @@ + +#include "b3GpuJacobiContactSolver.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +class b3Vector3; +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/RigidBody/kernels/solverUtils.h" +#include "Bullet3Common/b3Logging.h" +#include "b3GpuConstraint4.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#define SOLVER_UTILS_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl" + + +struct b3GpuJacobiSolverInternalData +{ + //btRadixSort32CL* m_sort32; + //btBoundSearchCL* m_search; + b3PrefixScanCL* m_scan; + + b3OpenCLArray<unsigned int>* m_bodyCount; + b3OpenCLArray<b3Int2>* m_contactConstraintOffsets; + b3OpenCLArray<unsigned int>* m_offsetSplitBodies; + + b3OpenCLArray<b3Vector3>* m_deltaLinearVelocities; + b3OpenCLArray<b3Vector3>* m_deltaAngularVelocities; + + b3AlignedObjectArray<b3Vector3> m_deltaLinearVelocitiesCPU; + b3AlignedObjectArray<b3Vector3> m_deltaAngularVelocitiesCPU; + + + + b3OpenCLArray<b3GpuConstraint4>* m_contactConstraints; + + b3FillCL* m_filler; + + + cl_kernel m_countBodiesKernel; + cl_kernel m_contactToConstraintSplitKernel; + cl_kernel m_clearVelocitiesKernel; + cl_kernel m_averageVelocitiesKernel; + cl_kernel m_updateBodyVelocitiesKernel; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; + + + +}; + + +b3GpuJacobiContactSolver::b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity) + :m_context(ctx), + m_device(device), + m_queue(queue) +{ + m_data = new b3GpuJacobiSolverInternalData; + m_data->m_scan = new b3PrefixScanCL(m_context,m_device,m_queue); + m_data->m_bodyCount = new b3OpenCLArray<unsigned int>(m_context,m_queue); + m_data->m_filler = new b3FillCL(m_context,m_device,m_queue); + m_data->m_contactConstraintOffsets = new b3OpenCLArray<b3Int2>(m_context,m_queue); + m_data->m_offsetSplitBodies = new b3OpenCLArray<unsigned int>(m_context,m_queue); + m_data->m_contactConstraints = new b3OpenCLArray<b3GpuConstraint4>(m_context,m_queue); + m_data->m_deltaLinearVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue); + m_data->m_deltaAngularVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue); + + cl_int pErrNum; + const char* additionalMacros=""; + const char* solverUtilsSource = solverUtilsCL; + { + cl_program solverUtilsProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverUtilsSource, &pErrNum,additionalMacros, SOLVER_UTILS_KERNEL_PATH); + b3Assert(solverUtilsProg); + m_data->m_countBodiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "CountBodiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_countBodiesKernel); + + m_data->m_contactToConstraintSplitKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ContactToConstraintSplitKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_contactToConstraintSplitKernel); + m_data->m_clearVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ClearVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_clearVelocitiesKernel); + + m_data->m_averageVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "AverageVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_averageVelocitiesKernel); + + m_data->m_updateBodyVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "UpdateBodyVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_updateBodyVelocitiesKernel); + + + m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveContactJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_solveContactKernel ); + + m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveFrictionJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros ); + b3Assert(m_data->m_solveFrictionKernel); + } + +} + + +b3GpuJacobiContactSolver::~b3GpuJacobiContactSolver() +{ + clReleaseKernel(m_data->m_solveContactKernel); + clReleaseKernel(m_data->m_solveFrictionKernel); + clReleaseKernel(m_data->m_countBodiesKernel); + clReleaseKernel(m_data->m_contactToConstraintSplitKernel); + clReleaseKernel(m_data->m_averageVelocitiesKernel); + clReleaseKernel(m_data->m_updateBodyVelocitiesKernel); + clReleaseKernel(m_data->m_clearVelocitiesKernel ); + + delete m_data->m_deltaLinearVelocities; + delete m_data->m_deltaAngularVelocities; + delete m_data->m_contactConstraints; + delete m_data->m_offsetSplitBodies; + delete m_data->m_contactConstraintOffsets; + delete m_data->m_bodyCount; + delete m_data->m_filler; + delete m_data->m_scan; + delete m_data; +} + + + +b3Vector3 make_float4(float v) +{ + return b3MakeVector3 (v,v,v); +} + +b3Vector4 make_float4(float x,float y, float z, float w) +{ + return b3MakeVector4 (x,y,z,w); +} + + + static + inline + float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1, + const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1) + { + return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1); + } + + + static + inline + void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1, + b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1) + { + linear = n; + angular0 = b3Cross(r0, n); + angular1 = -b3Cross(r1, n); + } + + +static __inline void solveContact(b3GpuConstraint4& cs, + const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) +{ + + + for(int ic=0; ic<4; ic++) + { + // dont necessary because this makes change to 0 + if( cs.m_jacCoeffInv[ic] == 0.f ) continue; + + { + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; + b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; + setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 ); + + float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, + linVelARO+dLinVelA, angVelARO+dAngVelA, linVelBRO+dLinVelB, angVelBRO+dAngVelB ) + cs.m_b[ic]; + rambdaDt *= cs.m_jacCoeffInv[ic]; + + { + float prevSum = cs.m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max( updated, minRambdaDt[ic] ); + updated = b3Min( updated, maxRambdaDt[ic] ); + rambdaDt = updated - prevSum; + cs.m_appliedRambdaDt[ic] = updated; + } + + b3Vector3 linImp0 = invMassA*linear*rambdaDt; + b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; + b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; + b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; +#ifdef _WIN32 + b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp1.getX())); +#endif + + if (invMassA) + { + dLinVelA += linImp0; + dAngVelA += angImp0; + } + if (invMassB) + { + dLinVelB += linImp1; + dAngVelB += angImp1; + } + } + } +} + + + +void solveContact3(b3GpuConstraint4* cs, + b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB) +{ + float minRambdaDt = 0; + float maxRambdaDt = FLT_MAX; + + for(int ic=0; ic<4; ic++) + { + if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = cs->m_worldPos[ic] - *posAPtr; + b3Vector3 r1 = cs->m_worldPos[ic] - *posBPtr; + setLinearAndAngular( cs->m_linear, r0, r1, linear, angular0, angular1 ); + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max( updated, minRambdaDt ); + updated = b3Min( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + b3Vector3 linImp0 = invMassA*linear*rambdaDt; + b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; + b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; + b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + + if (invMassA) + { + *dLinVelA += linImp0; + *dAngVelA += angImp0; + } + if (invMassB) + { + *dLinVelB += linImp1; + *dAngVelB += angImp1; + } + } +} + + +static inline void solveFriction(b3GpuConstraint4& cs, + const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) +{ + + b3Vector3 linVelA = linVelARO+dLinVelA; + b3Vector3 linVelB = linVelBRO+dLinVelB; + b3Vector3 angVelA = angVelARO+dAngVelA; + b3Vector3 angVelB = angVelBRO+dAngVelB; + + if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; + const b3Vector3& center = (const b3Vector3&)cs.m_center; + + b3Vector3 n = -(const b3Vector3&)cs.m_linear; + + b3Vector3 tangent[2]; +#if 1 + b3PlaneSpace1 (n, tangent[0],tangent[1]); +#else + b3Vector3 r = cs.m_worldPos[0]-center; + tangent[0] = cross3( n, r ); + tangent[1] = cross3( tangent[0], n ); + tangent[0] = normalize3( tangent[0] ); + tangent[1] = normalize3( tangent[1] ); +#endif + + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = center - posA; + b3Vector3 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs.m_fJacCoeffInv[i]; + + { + float prevSum = cs.m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max( updated, minRambdaDt[i] ); + updated = b3Min( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs.m_fAppliedRambdaDt[i] = updated; + } + + b3Vector3 linImp0 = invMassA*linear*rambdaDt; + b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; + b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; + b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; +#ifdef _WIN32 + b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp1.getX())); +#endif + if (invMassA) + { + dLinVelA += linImp0; + dAngVelA += angImp0; + } + if (invMassB) + { + dLinVelB += linImp1; + dAngVelB += angImp1; + } + } + + { // angular damping for point constraint + b3Vector3 ab = ( posB - posA ).normalized(); + b3Vector3 ac = ( center - posA ).normalized(); + if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = b3Dot( n, angVelA ); + float angNB = b3Dot( n, angVelB ); + + if (invMassA) + dAngVelA -= (angNA*0.1f)*n; + if (invMassB) + dAngVelB -= (angNB*0.1f)*n; + } + } + +} + + + + +float calcJacCoeff(const b3Vector3& linear0, const b3Vector3& linear1, const b3Vector3& angular0, const b3Vector3& angular1, + float invMass0, const b3Matrix3x3* invInertia0, float invMass1, const b3Matrix3x3* invInertia1, float countA, float countB) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + + float jmj1 = b3Dot(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = b3Dot(mtMul3(angular1,*invInertia1), angular1); + return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); +// return -1.f/((jmj0+jmj1)+(jmj2+jmj3)); + +} + + +void setConstraint4( const b3Vector3& posA, const b3Vector3& linVelA, const b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelB, const b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + b3Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, float countA, float countB, + b3GpuConstraint4* dstC ) +{ + dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); + dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit); + + float dtInv = 1.f/dt; + for(int ic=0; ic<4; ic++) + { + dstC->m_appliedRambdaDt[ic] = 0.f; + } + dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; + + + dstC->m_linear = src->m_worldNormalOnB; + dstC->m_linear[3] = 0.7f ;//src->getFrictionCoeff() ); + for(int ic=0; ic<4; ic++) + { + b3Vector3 r0 = src->m_worldPosB[ic] - posA; + b3Vector3 r1 = src->m_worldPosB[ic] - posB; + + if( ic >= src->m_worldNormalOnB[3] )//npoints + { + dstC->m_jacCoeffInv[ic] = 0.f; + continue; + } + + float relVelN; + { + b3Vector3 linear, angular0, angular1; + setLinearAndAngular(src->m_worldNormalOnB, r0, r1, linear, angular0, angular1); + + dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB); + + relVelN = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB); + + float e = 0.f;//src->getRestituitionCoeff(); + if( relVelN*relVelN < 0.004f ) + { + e = 0.f; + } + + dstC->m_b[ic] = e*relVelN; + //float penetration = src->m_worldPos[ic].w; + dstC->m_b[ic] += (src->m_worldPosB[ic][3] + positionDrift)*positionConstraintCoeff*dtInv; + dstC->m_appliedRambdaDt[ic] = 0.f; + } + } + + if( src->m_worldNormalOnB[3] > 0 )//npoints + { // prepare friction + b3Vector3 center = make_float4(0.f); + for(int i=0; i<src->m_worldNormalOnB[3]; i++) + center += src->m_worldPosB[i]; + center /= (float)src->m_worldNormalOnB[3]; + + b3Vector3 tangent[2]; + b3PlaneSpace1(src->m_worldNormalOnB,tangent[0],tangent[1]); + + b3Vector3 r[2]; + r[0] = center - posA; + r[1] = center - posB; + + for(int i=0; i<2; i++) + { + b3Vector3 linear, angular0, angular1; + setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1); + + dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB); + dstC->m_fAppliedRambdaDt[i] = 0.f; + } + dstC->m_center = center; + } + + for(int i=0; i<4; i++) + { + if( i<src->m_worldNormalOnB[3] ) + { + dstC->m_worldPos[i] = src->m_worldPosB[i]; + } + else + { + dstC->m_worldPos[i] = make_float4(0.f); + } + } +} + + + +void ContactToConstraintKernel(b3Contact4* gContact, b3RigidBodyData* gBodies, b3InertiaData* gShapes, b3GpuConstraint4* gConstraintOut, int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bodyCount +) +{ + //int gIdx = 0;//GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + b3Vector3 posA = gBodies[aIdx].m_pos; + b3Vector3 linVelA = gBodies[aIdx].m_linVel; + b3Vector3 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + b3Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertiaWorld;//.m_invInertia; + + b3Vector3 posB = gBodies[bIdx].m_pos; + b3Vector3 linVelB = gBodies[bIdx].m_linVel; + b3Vector3 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + b3Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertiaWorld;//m_invInertia; + + b3GpuConstraint4 cs; + float countA = invMassA ? (float)(bodyCount[aIdx]) : 1; + float countB = invMassB ? (float)(bodyCount[bIdx]) : 1; + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB, + &cs ); + + + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +} + + +void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo) +{ + B3_PROFILE("b3GpuJacobiContactSolver::solveGroup"); + + b3AlignedObjectArray<unsigned int> bodyCount; + bodyCount.resize(numBodies); + for (int i=0;i<numBodies;i++) + bodyCount[i] = 0; + + b3AlignedObjectArray<b3Int2> contactConstraintOffsets; + contactConstraintOffsets.resize(numManifolds); + + + for (int i=0;i<numManifolds;i++) + { + int pa = manifoldPtr[i].m_bodyAPtrAndSignBit; + int pb = manifoldPtr[i].m_bodyBPtrAndSignBit; + + bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex); + bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex); + + int bodyIndexA = manifoldPtr[i].getBodyA(); + int bodyIndexB = manifoldPtr[i].getBodyB(); + + if (!isFixedA) + { + contactConstraintOffsets[i].x = bodyCount[bodyIndexA]; + bodyCount[bodyIndexA]++; + } + if (!isFixedB) + { + contactConstraintOffsets[i].y = bodyCount[bodyIndexB]; + bodyCount[bodyIndexB]++; + } + } + + b3AlignedObjectArray<unsigned int> offsetSplitBodies; + offsetSplitBodies.resize(numBodies); + unsigned int totalNumSplitBodies; + m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodies,&totalNumSplitBodies); + int numlastBody = bodyCount[numBodies-1]; + totalNumSplitBodies += numlastBody; + printf("totalNumSplitBodies = %d\n",totalNumSplitBodies); + + + + + + b3AlignedObjectArray<b3GpuConstraint4> contactConstraints; + contactConstraints.resize(numManifolds); + + for (int i=0;i<numManifolds;i++) + { + ContactToConstraintKernel(&manifoldPtr[0],bodies,inertias,&contactConstraints[0],numManifolds, + solverInfo.m_deltaTime, + solverInfo.m_positionDrift, + solverInfo.m_positionConstraintCoeff, + i, bodyCount); + } + int maxIter = solverInfo.m_numIterations; + + + b3AlignedObjectArray<b3Vector3> deltaLinearVelocities; + b3AlignedObjectArray<b3Vector3> deltaAngularVelocities; + deltaLinearVelocities.resize(totalNumSplitBodies); + deltaAngularVelocities.resize(totalNumSplitBodies); + for (unsigned int i=0;i<totalNumSplitBodies;i++) + { + deltaLinearVelocities[i].setZero(); + deltaAngularVelocities[i].setZero(); + } + + + + for (int iter = 0;iter<maxIter;iter++) + { + int i=0; + for( i=0; i<numManifolds; i++) + { + + //float frictionCoeff = contactConstraints[i].getFrictionCoeff(); + int aIdx = (int)contactConstraints[i].m_bodyA; + int bIdx = (int)contactConstraints[i].m_bodyB; + b3RigidBodyData& bodyA = bodies[aIdx]; + b3RigidBodyData& bodyB = bodies[bIdx]; + + b3Vector3 zero = b3MakeVector3(0,0,0); + + b3Vector3* dlvAPtr=&zero; + b3Vector3* davAPtr=&zero; + b3Vector3* dlvBPtr=&zero; + b3Vector3* davBPtr=&zero; + + if (bodyA.m_invMass) + { + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[i].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + dlvAPtr = &deltaLinearVelocities[splitIndexA]; + davAPtr = &deltaAngularVelocities[splitIndexA]; + } + + if (bodyB.m_invMass) + { + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[i].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + dlvBPtr =&deltaLinearVelocities[splitIndexB]; + davBPtr = &deltaAngularVelocities[splitIndexB]; + } + + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr ); + + + } + + } + + + //easy + for (int i=0;i<numBodies;i++) + { + if (bodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/float(count); + b3Vector3 averageLinVel; + averageLinVel.setZero(); + b3Vector3 averageAngVel; + averageAngVel.setZero(); + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + } + } + } + for (int iter = 0;iter<maxIter;iter++) + { + //int i=0; + + //solve friction + + for(int i=0; i<numManifolds; i++) + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=contactConstraints[i].m_appliedRambdaDt[j]; + } + float frictionCoeff = contactConstraints[i].getFrictionCoeff(); + int aIdx = (int)contactConstraints[i].m_bodyA; + int bIdx = (int)contactConstraints[i].m_bodyB; + b3RigidBodyData& bodyA = bodies[aIdx]; + b3RigidBodyData& bodyB = bodies[bIdx]; + + b3Vector3 zero = b3MakeVector3(0,0,0); + + b3Vector3* dlvAPtr=&zero; + b3Vector3* davAPtr=&zero; + b3Vector3* dlvBPtr=&zero; + b3Vector3* davBPtr=&zero; + + if (bodyA.m_invMass) + { + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[i].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + dlvAPtr = &deltaLinearVelocities[splitIndexA]; + davAPtr = &deltaAngularVelocities[splitIndexA]; + } + + if (bodyB.m_invMass) + { + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[i].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + dlvBPtr =&deltaLinearVelocities[splitIndexB]; + davBPtr = &deltaAngularVelocities[splitIndexB]; + } + + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertias[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr); + + } + + //easy + for (int i=0;i<numBodies;i++) + { + if (bodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/float(count); + b3Vector3 averageLinVel; + averageLinVel.setZero(); + b3Vector3 averageAngVel; + averageAngVel.setZero(); + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + } + } + + + + } + + + //easy + for (int i=0;i<numBodies;i++) + { + if (bodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + if (count) + { + bodies[i].m_linVel += deltaLinearVelocities[bodyOffset]; + bodies[i].m_angVel += deltaAngularVelocities[bodyOffset]; + } + } + } +} + + + +void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index) +// +// +//void b3GpuJacobiContactSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* bodies,b3OpenCLArray<b3InertiaData>* inertias,b3OpenCLArray<b3Contact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo) +{ + b3JacobiSolverInfo solverInfo; + solverInfo.m_fixedBodyIndex = static0Index; + + + B3_PROFILE("b3GpuJacobiContactSolver::solveGroup"); + + //int numBodies = bodies->size(); + int numManifolds = numContacts;//manifoldPtr->size(); + + { + B3_PROFILE("resize"); + m_data->m_bodyCount->resize(numBodies); + } + + unsigned int val=0; + b3Int2 val2; + val2.x=0; + val2.y=0; + + { + B3_PROFILE("m_filler"); + m_data->m_contactConstraintOffsets->resize(numManifolds); + m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies); + + + m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds); + } + + { + B3_PROFILE("m_countBodiesKernel"); + b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel,"m_countBodiesKernel"); + launcher.setBuffer(contactBuf);//manifoldPtr->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setConst(numManifolds); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.launch1D(numManifolds); + } + unsigned int totalNumSplitBodies=0; + { + B3_PROFILE("m_scan->execute"); + + m_data->m_offsetSplitBodies->resize(numBodies); + m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies); + totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1); + } + + { + B3_PROFILE("m_data->m_contactConstraints->resize"); + //int numContacts = manifoldPtr->size(); + m_data->m_contactConstraints->resize(numContacts); + } + + { + B3_PROFILE("contactToConstraintSplitKernel"); + b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel,"m_contactToConstraintSplitKernel"); + launcher.setBuffer(contactBuf); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(inertiaBuf); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setConst(numContacts); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.launch1D( numContacts, 64 ); + + } + + + { + B3_PROFILE("m_data->m_deltaLinearVelocities->resize"); + m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies); + m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies); + } + + + + { + B3_PROFILE("m_clearVelocitiesKernel"); + b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel,"m_clearVelocitiesKernel"); + launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launch.setConst(totalNumSplitBodies); + launch.launch1D(totalNumSplitBodies); + clFinish(m_queue); + } + + + int maxIter = solverInfo.m_numIterations; + + for (int iter = 0;iter<maxIter;iter++) + { + { + B3_PROFILE("m_solveContactKernel"); + b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" ); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(inertiaBuf); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.setConst(numManifolds); + + launcher.launch1D(numManifolds); + clFinish(m_queue); + } + + + + { + B3_PROFILE("average velocities"); + b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel"); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + + { + B3_PROFILE("m_solveFrictionKernel"); + b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel"); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(inertiaBuf); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.setConst(numManifolds); + + launcher.launch1D(numManifolds); + clFinish(m_queue); + } + + + { + B3_PROFILE("average velocities"); + b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel"); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + + + } + + + { + B3_PROFILE("update body velocities"); + b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel,"m_updateBodyVelocitiesKernel"); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + + +} + +#if 0 + +void b3GpuJacobiContactSolver::solveGroupMixed(b3OpenCLArray<b3RigidBodyData>* bodiesGPU,b3OpenCLArray<b3InertiaData>* inertiasGPU,b3OpenCLArray<b3Contact4>* manifoldPtrGPU,const btJacobiSolverInfo& solverInfo) +{ + + b3AlignedObjectArray<b3RigidBodyData> bodiesCPU; + bodiesGPU->copyToHost(bodiesCPU); + b3AlignedObjectArray<b3InertiaData> inertiasCPU; + inertiasGPU->copyToHost(inertiasCPU); + b3AlignedObjectArray<b3Contact4> manifoldPtrCPU; + manifoldPtrGPU->copyToHost(manifoldPtrCPU); + + int numBodiesCPU = bodiesGPU->size(); + int numManifoldsCPU = manifoldPtrGPU->size(); + B3_PROFILE("b3GpuJacobiContactSolver::solveGroupMixed"); + + b3AlignedObjectArray<unsigned int> bodyCount; + bodyCount.resize(numBodiesCPU); + for (int i=0;i<numBodiesCPU;i++) + bodyCount[i] = 0; + + b3AlignedObjectArray<b3Int2> contactConstraintOffsets; + contactConstraintOffsets.resize(numManifoldsCPU); + + + for (int i=0;i<numManifoldsCPU;i++) + { + int pa = manifoldPtrCPU[i].m_bodyAPtrAndSignBit; + int pb = manifoldPtrCPU[i].m_bodyBPtrAndSignBit; + + bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex); + bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex); + + int bodyIndexA = manifoldPtrCPU[i].getBodyA(); + int bodyIndexB = manifoldPtrCPU[i].getBodyB(); + + if (!isFixedA) + { + contactConstraintOffsets[i].x = bodyCount[bodyIndexA]; + bodyCount[bodyIndexA]++; + } + if (!isFixedB) + { + contactConstraintOffsets[i].y = bodyCount[bodyIndexB]; + bodyCount[bodyIndexB]++; + } + } + + b3AlignedObjectArray<unsigned int> offsetSplitBodies; + offsetSplitBodies.resize(numBodiesCPU); + unsigned int totalNumSplitBodiesCPU; + m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodiesCPU,&totalNumSplitBodiesCPU); + int numlastBody = bodyCount[numBodiesCPU-1]; + totalNumSplitBodiesCPU += numlastBody; + + int numBodies = bodiesGPU->size(); + int numManifolds = manifoldPtrGPU->size(); + + m_data->m_bodyCount->resize(numBodies); + + unsigned int val=0; + b3Int2 val2; + val2.x=0; + val2.y=0; + + { + B3_PROFILE("m_filler"); + m_data->m_contactConstraintOffsets->resize(numManifolds); + m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies); + + + m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds); + } + + { + B3_PROFILE("m_countBodiesKernel"); + b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel); + launcher.setBuffer(manifoldPtrGPU->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setConst(numManifolds); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.launch1D(numManifolds); + } + + unsigned int totalNumSplitBodies=0; + m_data->m_offsetSplitBodies->resize(numBodies); + m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies); + totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1); + + if (totalNumSplitBodies != totalNumSplitBodiesCPU) + { + printf("error in totalNumSplitBodies!\n"); + } + + int numContacts = manifoldPtrGPU->size(); + m_data->m_contactConstraints->resize(numContacts); + + + { + B3_PROFILE("contactToConstraintSplitKernel"); + b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel); + launcher.setBuffer(manifoldPtrGPU->getBufferCL()); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(inertiasGPU->getBufferCL()); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setConst(numContacts); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.launch1D( numContacts, 64 ); + clFinish(m_queue); + } + + + + b3AlignedObjectArray<b3GpuConstraint4> contactConstraints; + contactConstraints.resize(numManifoldsCPU); + + for (int i=0;i<numManifoldsCPU;i++) + { + ContactToConstraintKernel(&manifoldPtrCPU[0],&bodiesCPU[0],&inertiasCPU[0],&contactConstraints[0],numManifoldsCPU, + solverInfo.m_deltaTime, + solverInfo.m_positionDrift, + solverInfo.m_positionConstraintCoeff, + i, bodyCount); + } + int maxIter = solverInfo.m_numIterations; + + + b3AlignedObjectArray<b3Vector3> deltaLinearVelocities; + b3AlignedObjectArray<b3Vector3> deltaAngularVelocities; + deltaLinearVelocities.resize(totalNumSplitBodiesCPU); + deltaAngularVelocities.resize(totalNumSplitBodiesCPU); + for (int i=0;i<totalNumSplitBodiesCPU;i++) + { + deltaLinearVelocities[i].setZero(); + deltaAngularVelocities[i].setZero(); + } + + m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies); + m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies); + + + + { + B3_PROFILE("m_clearVelocitiesKernel"); + b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel); + launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launch.setConst(totalNumSplitBodies); + launch.launch1D(totalNumSplitBodies); + } + + + ///!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + m_data->m_contactConstraints->copyToHost(contactConstraints); + m_data->m_offsetSplitBodies->copyToHost(offsetSplitBodies); + m_data->m_contactConstraintOffsets->copyToHost(contactConstraintOffsets); + m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities); + m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities); + + for (int iter = 0;iter<maxIter;iter++) + { + + { + B3_PROFILE("m_solveContactKernel"); + b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel ); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(inertiasGPU->getBufferCL()); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.setConst(numManifolds); + + launcher.launch1D(numManifolds); + clFinish(m_queue); + } + + + int i=0; + for( i=0; i<numManifoldsCPU; i++) + { + + float frictionCoeff = contactConstraints[i].getFrictionCoeff(); + int aIdx = (int)contactConstraints[i].m_bodyA; + int bIdx = (int)contactConstraints[i].m_bodyB; + b3RigidBodyData& bodyA = bodiesCPU[aIdx]; + b3RigidBodyData& bodyB = bodiesCPU[bIdx]; + + b3Vector3 zero(0,0,0); + + b3Vector3* dlvAPtr=&zero; + b3Vector3* davAPtr=&zero; + b3Vector3* dlvBPtr=&zero; + b3Vector3* davBPtr=&zero; + + if (bodyA.m_invMass) + { + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[i].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + dlvAPtr = &deltaLinearVelocities[splitIndexA]; + davAPtr = &deltaAngularVelocities[splitIndexA]; + } + + if (bodyB.m_invMass) + { + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[i].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + dlvBPtr =&deltaLinearVelocities[splitIndexB]; + davBPtr = &deltaAngularVelocities[splitIndexB]; + } + + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertiasCPU[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr ); + + + } + } + + + { + B3_PROFILE("average velocities"); + b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + //easy + for (int i=0;i<numBodiesCPU;i++) + { + if (bodiesCPU[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/float(count); + b3Vector3 averageLinVel; + averageLinVel.setZero(); + b3Vector3 averageAngVel; + averageAngVel.setZero(); + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + } + } +// m_data->m_deltaAngularVelocities->copyFromHost(deltaAngularVelocities); + //m_data->m_deltaLinearVelocities->copyFromHost(deltaLinearVelocities); + m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities); + m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities); + +#if 0 + + { + B3_PROFILE("m_solveFrictionKernel"); + b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel); + launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(inertiasGPU->getBufferCL()); + launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(solverInfo.m_deltaTime); + launcher.setConst(solverInfo.m_positionDrift); + launcher.setConst(solverInfo.m_positionConstraintCoeff); + launcher.setConst(solverInfo.m_fixedBodyIndex); + launcher.setConst(numManifolds); + + launcher.launch1D(numManifolds); + clFinish(m_queue); + } + + //solve friction + + for(int i=0; i<numManifoldsCPU; i++) + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=contactConstraints[i].m_appliedRambdaDt[j]; + } + float frictionCoeff = contactConstraints[i].getFrictionCoeff(); + int aIdx = (int)contactConstraints[i].m_bodyA; + int bIdx = (int)contactConstraints[i].m_bodyB; + b3RigidBodyData& bodyA = bodiesCPU[aIdx]; + b3RigidBodyData& bodyB = bodiesCPU[bIdx]; + + b3Vector3 zero(0,0,0); + + b3Vector3* dlvAPtr=&zero; + b3Vector3* davAPtr=&zero; + b3Vector3* dlvBPtr=&zero; + b3Vector3* davBPtr=&zero; + + if (bodyA.m_invMass) + { + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[i].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + dlvAPtr = &deltaLinearVelocities[splitIndexA]; + davAPtr = &deltaAngularVelocities[splitIndexA]; + } + + if (bodyB.m_invMass) + { + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[i].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + dlvBPtr =&deltaLinearVelocities[splitIndexB]; + davBPtr = &deltaAngularVelocities[splitIndexB]; + } + + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertiasCPU[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr); + + } + + { + B3_PROFILE("average velocities"); + b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + //easy + for (int i=0;i<numBodiesCPU;i++) + { + if (bodiesCPU[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/float(count); + b3Vector3 averageLinVel; + averageLinVel.setZero(); + b3Vector3 averageAngVel; + averageAngVel.setZero(); + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + } + } + +#endif + + } + + { + B3_PROFILE("update body velocities"); + b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel); + launcher.setBuffer(bodiesGPU->getBufferCL()); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } + + + //easy + for (int i=0;i<numBodiesCPU;i++) + { + if (bodiesCPU[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + if (count) + { + bodiesCPU[i].m_linVel += deltaLinearVelocities[bodyOffset]; + bodiesCPU[i].m_angVel += deltaAngularVelocities[bodyOffset]; + } + } + } + + +// bodiesGPU->copyFromHost(bodiesCPU); + + +} +#endif diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h new file mode 100644 index 0000000000..b418f29ec4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h @@ -0,0 +1,62 @@ + +#ifndef B3_GPU_JACOBI_CONTACT_SOLVER_H +#define B3_GPU_JACOBI_CONTACT_SOLVER_H +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +//#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" + + +//struct b3InertiaData; +//b3InertiaData + +class b3TypedConstraint; + +struct b3JacobiSolverInfo +{ + int m_fixedBodyIndex; + + float m_deltaTime; + float m_positionDrift; + float m_positionConstraintCoeff; + int m_numIterations; + + b3JacobiSolverInfo() + :m_fixedBodyIndex(0), + m_deltaTime(1./60.f), + m_positionDrift( 0.005f ), + m_positionConstraintCoeff( 0.99f ), + m_numIterations(7) + { + } +}; +class b3GpuJacobiContactSolver +{ +protected: + + struct b3GpuJacobiSolverInternalData* m_data; + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + +public: + + b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); + virtual ~b3GpuJacobiContactSolver(); + + + void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); + void solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,struct b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo); + //void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo); + + //b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); + + //void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo); + //void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo); + +}; +#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp new file mode 100644 index 0000000000..698fa15f96 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp @@ -0,0 +1,1107 @@ +#include "b3GpuNarrowPhase.h" + + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include <string.h> +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h" +#include "Bullet3Geometry/b3AabbUtil.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h" + +#include "b3GpuNarrowPhaseInternalData.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h" + + + + +b3GpuNarrowPhase::b3GpuNarrowPhase(cl_context ctx, cl_device_id device, cl_command_queue queue, const b3Config& config) +:m_data(0) ,m_planeBodyIndex(-1),m_static0Index(-1), +m_context(ctx), +m_device(device), +m_queue(queue) +{ + + m_data = new b3GpuNarrowPhaseInternalData(); + m_data->m_currentContactBuffer = 0; + + memset(m_data,0,sizeof(b3GpuNarrowPhaseInternalData)); + + + m_data->m_config = config; + + m_data->m_gpuSatCollision = new GpuSatCollision(ctx,device,queue); + + + m_data->m_triangleConvexPairs = new b3OpenCLArray<b3Int4>(m_context,m_queue, config.m_maxTriConvexPairCapacity); + + + //m_data->m_convexPairsOutGPU = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false); + //m_data->m_planePairs = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false); + + m_data->m_pBufContactOutCPU = new b3AlignedObjectArray<b3Contact4>(); + m_data->m_pBufContactOutCPU->resize(config.m_maxBroadphasePairs); + m_data->m_bodyBufferCPU = new b3AlignedObjectArray<b3RigidBodyData>(); + m_data->m_bodyBufferCPU->resize(config.m_maxConvexBodies); + + m_data->m_inertiaBufferCPU = new b3AlignedObjectArray<b3InertiaData>(); + m_data->m_inertiaBufferCPU->resize(config.m_maxConvexBodies); + + m_data->m_pBufContactBuffersGPU[0] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true); + m_data->m_pBufContactBuffersGPU[1] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true); + + m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,queue,config.m_maxConvexBodies,false); + m_data->m_collidablesGPU = new b3OpenCLArray<b3Collidable>(ctx,queue,config.m_maxConvexShapes); + m_data->m_collidablesCPU.reserve(config.m_maxConvexShapes); + + m_data->m_localShapeAABBCPU = new b3AlignedObjectArray<b3SapAabb>; + m_data->m_localShapeAABBGPU = new b3OpenCLArray<b3SapAabb>(ctx,queue,config.m_maxConvexShapes); + + + //m_data->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate(ctx,queue, config.m_maxBroadphasePairs,false); + m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,queue, config.m_maxConvexBodies,false); + + m_data->m_convexFacesGPU = new b3OpenCLArray<b3GpuFace>(ctx,queue,config.m_maxConvexShapes*config.m_maxFacesPerShape,false); + m_data->m_convexFaces.reserve(config.m_maxConvexShapes*config.m_maxFacesPerShape); + + m_data->m_gpuChildShapes = new b3OpenCLArray<b3GpuChildShape>(ctx,queue,config.m_maxCompoundChildShapes,false); + + m_data->m_convexPolyhedraGPU = new b3OpenCLArray<b3ConvexPolyhedronData>(ctx,queue,config.m_maxConvexShapes,false); + m_data->m_convexPolyhedra.reserve(config.m_maxConvexShapes); + + m_data->m_uniqueEdgesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexUniqueEdges,true); + m_data->m_uniqueEdges.reserve(config.m_maxConvexUniqueEdges); + + + + m_data->m_convexVerticesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexVertices,true); + m_data->m_convexVertices.reserve(config.m_maxConvexVertices); + + m_data->m_convexIndicesGPU = new b3OpenCLArray<int>(ctx,queue,config.m_maxConvexIndices,true); + m_data->m_convexIndices.reserve(config.m_maxConvexIndices); + + m_data->m_worldVertsB1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); + m_data->m_clippingFacesOutGPU = new b3OpenCLArray<b3Int4>(ctx,queue,config.m_maxConvexBodies); + m_data->m_worldNormalsAGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies); + m_data->m_worldVertsA1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); + m_data->m_worldVertsB2GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); + + + + m_data->m_convexData = new b3AlignedObjectArray<b3ConvexUtility* >(); + + m_data->m_convexData->resize(config.m_maxConvexShapes); + m_data->m_convexPolyhedra.resize(config.m_maxConvexShapes); + + m_data->m_numAcceleratedShapes = 0; + m_data->m_numAcceleratedRigidBodies = 0; + + + m_data->m_subTreesGPU = new b3OpenCLArray<b3BvhSubtreeInfo>(this->m_context,this->m_queue); + m_data->m_treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue); + m_data->m_bvhInfoGPU = new b3OpenCLArray<b3BvhInfo>(this->m_context,this->m_queue); + + //m_data->m_contactCGPU = new b3OpenCLArray<Constraint4>(ctx,queue,config.m_maxBroadphasePairs,false); + //m_data->m_frictionCGPU = new b3OpenCLArray<adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( m_data->m_deviceCL, config.m_maxBroadphasePairs); + + + +} + + +b3GpuNarrowPhase::~b3GpuNarrowPhase() +{ + delete m_data->m_gpuSatCollision; + + delete m_data->m_triangleConvexPairs; + //delete m_data->m_convexPairsOutGPU; + //delete m_data->m_planePairs; + delete m_data->m_pBufContactOutCPU; + delete m_data->m_bodyBufferCPU; + delete m_data->m_inertiaBufferCPU; + delete m_data->m_pBufContactBuffersGPU[0]; + delete m_data->m_pBufContactBuffersGPU[1]; + + + delete m_data->m_inertiaBufferGPU; + delete m_data->m_collidablesGPU; + delete m_data->m_localShapeAABBCPU; + delete m_data->m_localShapeAABBGPU; + delete m_data->m_bodyBufferGPU; + delete m_data->m_convexFacesGPU; + delete m_data->m_gpuChildShapes; + delete m_data->m_convexPolyhedraGPU; + delete m_data->m_uniqueEdgesGPU; + delete m_data->m_convexVerticesGPU; + delete m_data->m_convexIndicesGPU; + delete m_data->m_worldVertsB1GPU; + delete m_data->m_clippingFacesOutGPU; + delete m_data->m_worldNormalsAGPU; + delete m_data->m_worldVertsA1GPU; + delete m_data->m_worldVertsB2GPU; + + delete m_data->m_bvhInfoGPU; + + for (int i=0;i<m_data->m_bvhData.size();i++) + { + delete m_data->m_bvhData[i]; + } + for (int i=0;i<m_data->m_meshInterfaces.size();i++) + { + delete m_data->m_meshInterfaces[i]; + } + m_data->m_meshInterfaces.clear(); + m_data->m_bvhData.clear(); + delete m_data->m_treeNodesGPU; + delete m_data->m_subTreesGPU; + + + delete m_data->m_convexData; + delete m_data; +} + + +int b3GpuNarrowPhase::allocateCollidable() +{ + int curSize = m_data->m_collidablesCPU.size(); + if (curSize<m_data->m_config.m_maxConvexShapes) + { + m_data->m_collidablesCPU.expand(); + return curSize; + } + else + { + b3Error("allocateCollidable out-of-range %d\n",m_data->m_config.m_maxConvexShapes); + } + return -1; + +} + + + + + +int b3GpuNarrowPhase::registerSphereShape(float radius) +{ + int collidableIndex = allocateCollidable(); + if (collidableIndex<0) + return collidableIndex; + + + b3Collidable& col = getCollidableCpu(collidableIndex); + col.m_shapeType = SHAPE_SPHERE; + col.m_shapeIndex = 0; + col.m_radius = radius; + + if (col.m_shapeIndex>=0) + { + b3SapAabb aabb; + b3Vector3 myAabbMin=b3MakeVector3(-radius,-radius,-radius); + b3Vector3 myAabbMax=b3MakeVector3(radius,radius,radius); + + aabb.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x; + aabb.m_min[1] = myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y; + aabb.m_min[2] = myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z; + aabb.m_minIndices[3] = 0; + + aabb.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x; + aabb.m_max[1] = myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y; + aabb.m_max[2] = myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z; + aabb.m_signedMaxIndices[3] = 0; + + m_data->m_localShapeAABBCPU->push_back(aabb); +// m_data->m_localShapeAABBGPU->push_back(aabb); + clFinish(m_queue); + } + + return collidableIndex; +} + + +int b3GpuNarrowPhase::registerFace(const b3Vector3& faceNormal, float faceConstant) +{ + int faceOffset = m_data->m_convexFaces.size(); + b3GpuFace& face = m_data->m_convexFaces.expand(); + face.m_plane = b3MakeVector3(faceNormal.x,faceNormal.y,faceNormal.z,faceConstant); + return faceOffset; +} + +int b3GpuNarrowPhase::registerPlaneShape(const b3Vector3& planeNormal, float planeConstant) +{ + int collidableIndex = allocateCollidable(); + if (collidableIndex<0) + return collidableIndex; + + + b3Collidable& col = getCollidableCpu(collidableIndex); + col.m_shapeType = SHAPE_PLANE; + col.m_shapeIndex = registerFace(planeNormal,planeConstant); + col.m_radius = planeConstant; + + if (col.m_shapeIndex>=0) + { + b3SapAabb aabb; + aabb.m_min[0] = -1e30f; + aabb.m_min[1] = -1e30f; + aabb.m_min[2] = -1e30f; + aabb.m_minIndices[3] = 0; + + aabb.m_max[0] = 1e30f; + aabb.m_max[1] = 1e30f; + aabb.m_max[2] = 1e30f; + aabb.m_signedMaxIndices[3] = 0; + + m_data->m_localShapeAABBCPU->push_back(aabb); +// m_data->m_localShapeAABBGPU->push_back(aabb); + clFinish(m_queue); + } + + return collidableIndex; +} + + +int b3GpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr,b3Collidable& col) +{ + + m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1); + m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1); + + + b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); + convex.mC = convexPtr->mC; + convex.mE = convexPtr->mE; + convex.m_extents= convexPtr->m_extents; + convex.m_localCenter = convexPtr->m_localCenter; + convex.m_radius = convexPtr->m_radius; + + convex.m_numUniqueEdges = convexPtr->m_uniqueEdges.size(); + int edgeOffset = m_data->m_uniqueEdges.size(); + convex.m_uniqueEdgesOffset = edgeOffset; + + m_data->m_uniqueEdges.resize(edgeOffset+convex.m_numUniqueEdges); + + //convex data here + int i; + for ( i=0;i<convexPtr->m_uniqueEdges.size();i++) + { + m_data->m_uniqueEdges[edgeOffset+i] = convexPtr->m_uniqueEdges[i]; + } + + int faceOffset = m_data->m_convexFaces.size(); + convex.m_faceOffset = faceOffset; + convex.m_numFaces = convexPtr->m_faces.size(); + + m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces); + + + for (i=0;i<convexPtr->m_faces.size();i++) + { + m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector3(convexPtr->m_faces[i].m_plane[0], + convexPtr->m_faces[i].m_plane[1], + convexPtr->m_faces[i].m_plane[2], + convexPtr->m_faces[i].m_plane[3]); + + + int indexOffset = m_data->m_convexIndices.size(); + int numIndices = convexPtr->m_faces[i].m_indices.size(); + m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices; + m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset; + m_data->m_convexIndices.resize(indexOffset+numIndices); + for (int p=0;p<numIndices;p++) + { + m_data->m_convexIndices[indexOffset+p] = convexPtr->m_faces[i].m_indices[p]; + } + } + + convex.m_numVertices = convexPtr->m_vertices.size(); + int vertexOffset = m_data->m_convexVertices.size(); + convex.m_vertexOffset =vertexOffset; + + m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices); + for (int i=0;i<convexPtr->m_vertices.size();i++) + { + m_data->m_convexVertices[vertexOffset+i] = convexPtr->m_vertices[i]; + } + + (*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = convexPtr; + + + + return m_data->m_numAcceleratedShapes++; +} + + +int b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) +{ + b3AlignedObjectArray<b3Vector3> verts; + + unsigned char* vts = (unsigned char*) vertices; + for (int i=0;i<numVertices;i++) + { + float* vertex = (float*) &vts[i*strideInBytes]; + verts.push_back(b3MakeVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2])); + } + + b3ConvexUtility* utilPtr = new b3ConvexUtility(); + bool merge = true; + if (numVertices) + { + utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge); + } + + int collidableIndex = registerConvexHullShape(utilPtr); + delete utilPtr; + return collidableIndex; +} + +int b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr) +{ + int collidableIndex = allocateCollidable(); + if (collidableIndex<0) + return collidableIndex; + + b3Collidable& col = getCollidableCpu(collidableIndex); + col.m_shapeType = SHAPE_CONVEX_HULL; + col.m_shapeIndex = -1; + + + { + b3Vector3 localCenter=b3MakeVector3(0,0,0); + for (int i=0;i<utilPtr->m_vertices.size();i++) + localCenter+=utilPtr->m_vertices[i]; + localCenter*= (1.f/utilPtr->m_vertices.size()); + utilPtr->m_localCenter = localCenter; + + col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr,col); + } + + if (col.m_shapeIndex>=0) + { + b3SapAabb aabb; + + b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); + b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); + + for (int i=0;i<utilPtr->m_vertices.size();i++) + { + myAabbMin.setMin(utilPtr->m_vertices[i]); + myAabbMax.setMax(utilPtr->m_vertices[i]); + } + aabb.m_min[0] = myAabbMin[0]; + aabb.m_min[1] = myAabbMin[1]; + aabb.m_min[2] = myAabbMin[2]; + aabb.m_minIndices[3] = 0; + + aabb.m_max[0] = myAabbMax[0]; + aabb.m_max[1] = myAabbMax[1]; + aabb.m_max[2] = myAabbMax[2]; + aabb.m_signedMaxIndices[3] = 0; + + m_data->m_localShapeAABBCPU->push_back(aabb); +// m_data->m_localShapeAABBGPU->push_back(aabb); + } + + return collidableIndex; + +} + +int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes) +{ + + int collidableIndex = allocateCollidable(); + if (collidableIndex<0) + return collidableIndex; + + b3Collidable& col = getCollidableCpu(collidableIndex); + col.m_shapeType = SHAPE_COMPOUND_OF_CONVEX_HULLS; + col.m_shapeIndex = m_data->m_cpuChildShapes.size(); + col.m_compoundBvhIndex = m_data->m_bvhInfoCPU.size(); + + { + b3Assert(col.m_shapeIndex+childShapes->size()<m_data->m_config.m_maxCompoundChildShapes); + for (int i=0;i<childShapes->size();i++) + { + m_data->m_cpuChildShapes.push_back(childShapes->at(i)); + } + } + + + + col.m_numChildShapes = childShapes->size(); + + + b3SapAabb aabbLocalSpace; + b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); + b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); + + b3AlignedObjectArray<b3Aabb> childLocalAabbs; + childLocalAabbs.resize(childShapes->size()); + + //compute local AABB of the compound of all children + for (int i=0;i<childShapes->size();i++) + { + int childColIndex = childShapes->at(i).m_shapeIndex; + //b3Collidable& childCol = getCollidableCpu(childColIndex); + b3SapAabb aabbLoc =m_data->m_localShapeAABBCPU->at(childColIndex); + + b3Vector3 childLocalAabbMin=b3MakeVector3(aabbLoc.m_min[0],aabbLoc.m_min[1],aabbLoc.m_min[2]); + b3Vector3 childLocalAabbMax=b3MakeVector3(aabbLoc.m_max[0],aabbLoc.m_max[1],aabbLoc.m_max[2]); + b3Vector3 aMin,aMax; + b3Scalar margin(0.f); + b3Transform childTr; + childTr.setIdentity(); + + childTr.setOrigin(childShapes->at(i).m_childPosition); + childTr.setRotation(b3Quaternion(childShapes->at(i).m_childOrientation)); + b3TransformAabb(childLocalAabbMin,childLocalAabbMax,margin,childTr,aMin,aMax); + myAabbMin.setMin(aMin); + myAabbMax.setMax(aMax); + childLocalAabbs[i].m_min[0] = aMin[0]; + childLocalAabbs[i].m_min[1] = aMin[1]; + childLocalAabbs[i].m_min[2] = aMin[2]; + childLocalAabbs[i].m_min[3] = 0; + childLocalAabbs[i].m_max[0] = aMax[0]; + childLocalAabbs[i].m_max[1] = aMax[1]; + childLocalAabbs[i].m_max[2] = aMax[2]; + childLocalAabbs[i].m_max[3] = 0; + } + + aabbLocalSpace.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x; + aabbLocalSpace.m_min[1]= myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y; + aabbLocalSpace.m_min[2]= myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z; + aabbLocalSpace.m_minIndices[3] = 0; + + aabbLocalSpace.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x; + aabbLocalSpace.m_max[1]= myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y; + aabbLocalSpace.m_max[2]= myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z; + aabbLocalSpace.m_signedMaxIndices[3] = 0; + + m_data->m_localShapeAABBCPU->push_back(aabbLocalSpace); + + + b3QuantizedBvh* bvh = new b3QuantizedBvh; + bvh->setQuantizationValues(myAabbMin,myAabbMax); + QuantizedNodeArray& nodes = bvh->getLeafNodeArray(); + int numNodes = childShapes->size(); + + for (int i=0;i<numNodes;i++) + { + b3QuantizedBvhNode node; + b3Vector3 aabbMin,aabbMax; + aabbMin = (b3Vector3&) childLocalAabbs[i].m_min; + aabbMax = (b3Vector3&) childLocalAabbs[i].m_max; + + bvh->quantize(&node.m_quantizedAabbMin[0],aabbMin,0); + bvh->quantize(&node.m_quantizedAabbMax[0],aabbMax,1); + int partId = 0; + node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | i; + nodes.push_back(node); + } + bvh->buildInternal(); + + int numSubTrees = bvh->getSubtreeInfoArray().size(); + + //void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0)); + //QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; } + ///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized + //void buildInternal(); + + b3BvhInfo bvhInfo; + + bvhInfo.m_aabbMin = bvh->m_bvhAabbMin; + bvhInfo.m_aabbMax = bvh->m_bvhAabbMax; + bvhInfo.m_quantization = bvh->m_bvhQuantization; + bvhInfo.m_numNodes = numNodes; + bvhInfo.m_numSubTrees = numSubTrees; + bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size(); + bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size(); + + int numNewNodes = bvh->getQuantizedNodeArray().size(); + + for (int i=0;i<numNewNodes-1;i++) + { + + if (bvh->getQuantizedNodeArray()[i].isLeafNode()) + { + int orgIndex = bvh->getQuantizedNodeArray()[i].getTriangleIndex(); + + b3Vector3 nodeMinVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMin); + b3Vector3 nodeMaxVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMax); + + for (int c=0;c<3;c++) + { + if (childLocalAabbs[orgIndex].m_min[c] < nodeMinVec[c]) + { + printf("min org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_min[c],nodeMinVec[c],i,c); + } + if (childLocalAabbs[orgIndex].m_max[c] > nodeMaxVec[c]) + { + printf("max org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_max[c],nodeMaxVec[c],i,c); + } + + } + } + + } + + m_data->m_bvhInfoCPU.push_back(bvhInfo); + + int numNewSubtrees = bvh->getSubtreeInfoArray().size(); + m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees); + for (int i=0;i<numNewSubtrees;i++) + { + m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]); + } + int numNewTreeNodes = bvh->getQuantizedNodeArray().size(); + + for (int i=0;i<numNewTreeNodes;i++) + { + m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]); + } + +// m_data->m_localShapeAABBGPU->push_back(aabbWS); + clFinish(m_queue); + return collidableIndex; + +} + + +int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling1) +{ + + + b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]); + + int collidableIndex = allocateCollidable(); + if (collidableIndex<0) + return collidableIndex; + + b3Collidable& col = getCollidableCpu(collidableIndex); + + col.m_shapeType = SHAPE_CONCAVE_TRIMESH; + col.m_shapeIndex = registerConcaveMeshShape(vertices,indices,col,scaling); + col.m_bvhIndex = m_data->m_bvhInfoCPU.size(); + + + b3SapAabb aabb; + b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); + b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); + + for (int i=0;i<vertices->size();i++) + { + b3Vector3 vtx(vertices->at(i)*scaling); + myAabbMin.setMin(vtx); + myAabbMax.setMax(vtx); + } + aabb.m_min[0] = myAabbMin[0]; + aabb.m_min[1] = myAabbMin[1]; + aabb.m_min[2] = myAabbMin[2]; + aabb.m_minIndices[3] = 0; + + aabb.m_max[0] = myAabbMax[0]; + aabb.m_max[1]= myAabbMax[1]; + aabb.m_max[2]= myAabbMax[2]; + aabb.m_signedMaxIndices[3]= 0; + + m_data->m_localShapeAABBCPU->push_back(aabb); +// m_data->m_localShapeAABBGPU->push_back(aabb); + + b3OptimizedBvh* bvh = new b3OptimizedBvh(); + //void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax) + + bool useQuantizedAabbCompression = true; + b3TriangleIndexVertexArray* meshInterface=new b3TriangleIndexVertexArray(); + m_data->m_meshInterfaces.push_back(meshInterface); + b3IndexedMesh mesh; + mesh.m_numTriangles = indices->size()/3; + mesh.m_numVertices = vertices->size(); + mesh.m_vertexBase = (const unsigned char *)&vertices->at(0).x; + mesh.m_vertexStride = sizeof(b3Vector3); + mesh.m_triangleIndexStride = 3 * sizeof(int);// or sizeof(int) + mesh.m_triangleIndexBase = (const unsigned char *)&indices->at(0); + + meshInterface->addIndexedMesh(mesh); + bvh->build(meshInterface, useQuantizedAabbCompression, (b3Vector3&)aabb.m_min, (b3Vector3&)aabb.m_max); + m_data->m_bvhData.push_back(bvh); + int numNodes = bvh->getQuantizedNodeArray().size(); + //b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue,numNodes); + int numSubTrees = bvh->getSubtreeInfoArray().size(); + + b3BvhInfo bvhInfo; + + bvhInfo.m_aabbMin = bvh->m_bvhAabbMin; + bvhInfo.m_aabbMax = bvh->m_bvhAabbMax; + bvhInfo.m_quantization = bvh->m_bvhQuantization; + bvhInfo.m_numNodes = numNodes; + bvhInfo.m_numSubTrees = numSubTrees; + bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size(); + bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size(); + + m_data->m_bvhInfoCPU.push_back(bvhInfo); + + + int numNewSubtrees = bvh->getSubtreeInfoArray().size(); + m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees); + for (int i=0;i<numNewSubtrees;i++) + { + m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]); + } + int numNewTreeNodes = bvh->getQuantizedNodeArray().size(); + + for (int i=0;i<numNewTreeNodes;i++) + { + m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]); + } + + + + + return collidableIndex; +} + +int b3GpuNarrowPhase::registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,b3Collidable& col, const float* scaling1) +{ + + + b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]); + + m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1); + m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1); + + + b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); + convex.mC = b3MakeVector3(0,0,0); + convex.mE = b3MakeVector3(0,0,0); + convex.m_extents= b3MakeVector3(0,0,0); + convex.m_localCenter = b3MakeVector3(0,0,0); + convex.m_radius = 0.f; + + convex.m_numUniqueEdges = 0; + int edgeOffset = m_data->m_uniqueEdges.size(); + convex.m_uniqueEdgesOffset = edgeOffset; + + int faceOffset = m_data->m_convexFaces.size(); + convex.m_faceOffset = faceOffset; + + convex.m_numFaces = indices->size()/3; + m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces); + m_data->m_convexIndices.reserve(convex.m_numFaces*3); + for (int i=0;i<convex.m_numFaces;i++) + { + if (i%256==0) + { + //printf("i=%d out of %d", i,convex.m_numFaces); + } + b3Vector3 vert0(vertices->at(indices->at(i*3))*scaling); + b3Vector3 vert1(vertices->at(indices->at(i*3+1))*scaling); + b3Vector3 vert2(vertices->at(indices->at(i*3+2))*scaling); + + b3Vector3 normal = ((vert1-vert0).cross(vert2-vert0)).normalize(); + b3Scalar c = -(normal.dot(vert0)); + + m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector4(normal.x,normal.y,normal.z,c); + int indexOffset = m_data->m_convexIndices.size(); + int numIndices = 3; + m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices; + m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset; + m_data->m_convexIndices.resize(indexOffset+numIndices); + for (int p=0;p<numIndices;p++) + { + int vi = indices->at(i*3+p); + m_data->m_convexIndices[indexOffset+p] = vi;//convexPtr->m_faces[i].m_indices[p]; + } + } + + convex.m_numVertices = vertices->size(); + int vertexOffset = m_data->m_convexVertices.size(); + convex.m_vertexOffset =vertexOffset; + m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices); + for (int i=0;i<vertices->size();i++) + { + m_data->m_convexVertices[vertexOffset+i] = vertices->at(i)*scaling; + } + + (*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = 0; + + + return m_data->m_numAcceleratedShapes++; +} + + + +cl_mem b3GpuNarrowPhase::getBodiesGpu() +{ + return (cl_mem)m_data->m_bodyBufferGPU->getBufferCL(); +} + +const struct b3RigidBodyData* b3GpuNarrowPhase::getBodiesCpu() const +{ + return &m_data->m_bodyBufferCPU->at(0); +}; + + + + +int b3GpuNarrowPhase::getNumBodiesGpu() const +{ + return m_data->m_bodyBufferGPU->size(); +} + +cl_mem b3GpuNarrowPhase::getBodyInertiasGpu() +{ + return (cl_mem)m_data->m_inertiaBufferGPU->getBufferCL(); +} + +int b3GpuNarrowPhase::getNumBodyInertiasGpu() const +{ + return m_data->m_inertiaBufferGPU->size(); +} + + +b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) +{ + return m_data->m_collidablesCPU[collidableIndex]; +} + +const b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) const +{ + return m_data->m_collidablesCPU[collidableIndex]; +} + +cl_mem b3GpuNarrowPhase::getCollidablesGpu() +{ + return m_data->m_collidablesGPU->getBufferCL(); +} + +const struct b3Collidable* b3GpuNarrowPhase::getCollidablesCpu() const +{ + if (m_data->m_collidablesCPU.size()) + return &m_data->m_collidablesCPU[0]; + return 0; +} + +const struct b3SapAabb* b3GpuNarrowPhase::getLocalSpaceAabbsCpu() const +{ + if (m_data->m_localShapeAABBCPU->size()) + { + return &m_data->m_localShapeAABBCPU->at(0); + } + return 0; +} + + +cl_mem b3GpuNarrowPhase::getAabbLocalSpaceBufferGpu() +{ + return m_data->m_localShapeAABBGPU->getBufferCL(); +} +int b3GpuNarrowPhase::getNumCollidablesGpu() const +{ + return m_data->m_collidablesGPU->size(); +} + + + + + +int b3GpuNarrowPhase::getNumContactsGpu() const +{ + return m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size(); +} +cl_mem b3GpuNarrowPhase::getContactsGpu() +{ + return m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->getBufferCL(); +} + +const b3Contact4* b3GpuNarrowPhase::getContactsCPU() const +{ + m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->copyToHost(*m_data->m_pBufContactOutCPU); + return &m_data->m_pBufContactOutCPU->at(0); +} + +void b3GpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects) +{ + + cl_mem aabbsLocalSpace = m_data->m_localShapeAABBGPU->getBufferCL(); + + int nContactOut = 0; + + //swap buffer + m_data->m_currentContactBuffer=1-m_data->m_currentContactBuffer; + + //int curSize = m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size(); + + int maxTriConvexPairCapacity = m_data->m_config.m_maxTriConvexPairCapacity; + int numTriConvexPairsOut=0; + + b3OpenCLArray<b3Int4> broadphasePairsGPU(m_context,m_queue); + broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs); + + + + + b3OpenCLArray<b3Aabb> clAabbArrayWorldSpace(this->m_context,this->m_queue); + clAabbArrayWorldSpace.setFromOpenCLBuffer(aabbsWorldSpace,numObjects); + + b3OpenCLArray<b3Aabb> clAabbArrayLocalSpace(this->m_context,this->m_queue); + clAabbArrayLocalSpace.setFromOpenCLBuffer(aabbsLocalSpace,numObjects); + + m_data->m_gpuSatCollision->computeConvexConvexContactsGPUSAT( + &broadphasePairsGPU, numBroadphasePairs, + m_data->m_bodyBufferGPU, + m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer], + nContactOut, + m_data->m_pBufContactBuffersGPU[1-m_data->m_currentContactBuffer], + m_data->m_config.m_maxContactCapacity, + m_data->m_config.m_compoundPairCapacity, + *m_data->m_convexPolyhedraGPU, + *m_data->m_convexVerticesGPU, + *m_data->m_uniqueEdgesGPU, + *m_data->m_convexFacesGPU, + *m_data->m_convexIndicesGPU, + *m_data->m_collidablesGPU, + *m_data->m_gpuChildShapes, + clAabbArrayWorldSpace, + clAabbArrayLocalSpace, + *m_data->m_worldVertsB1GPU, + *m_data->m_clippingFacesOutGPU, + *m_data->m_worldNormalsAGPU, + *m_data->m_worldVertsA1GPU, + *m_data->m_worldVertsB2GPU, + m_data->m_bvhData, + m_data->m_treeNodesGPU, + m_data->m_subTreesGPU, + m_data->m_bvhInfoGPU, + numObjects, + maxTriConvexPairCapacity, + *m_data->m_triangleConvexPairs, + numTriConvexPairsOut + ); + + /*b3AlignedObjectArray<b3Int4> broadphasePairsCPU; + broadphasePairsGPU.copyToHost(broadphasePairsCPU); + printf("checking pairs\n"); + */ +} + +const b3SapAabb& b3GpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const +{ + return m_data->m_localShapeAABBCPU->at(collidableIndex); +} + + + + + +int b3GpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation , const float* aabbMinPtr, const float* aabbMaxPtr,bool writeToGpu) +{ + b3Vector3 aabbMin=b3MakeVector3(aabbMinPtr[0],aabbMinPtr[1],aabbMinPtr[2]); + b3Vector3 aabbMax=b3MakeVector3(aabbMaxPtr[0],aabbMaxPtr[1],aabbMaxPtr[2]); + + + if (m_data->m_numAcceleratedRigidBodies >= (m_data->m_config.m_maxConvexBodies)) + { + b3Error("registerRigidBody: exceeding the number of rigid bodies, %d > %d \n",m_data->m_numAcceleratedRigidBodies,m_data->m_config.m_maxConvexBodies); + return -1; + } + + m_data->m_bodyBufferCPU->resize(m_data->m_numAcceleratedRigidBodies+1); + + b3RigidBodyData& body = m_data->m_bodyBufferCPU->at(m_data->m_numAcceleratedRigidBodies); + + float friction = 1.f; + float restitution = 0.f; + + body.m_frictionCoeff = friction; + body.m_restituitionCoeff = restitution; + body.m_angVel = b3MakeVector3(0,0,0); + body.m_linVel=b3MakeVector3(0,0,0);//.setZero(); + body.m_pos =b3MakeVector3(position[0],position[1],position[2]); + body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]); + body.m_collidableIdx = collidableIndex; + if (collidableIndex>=0) + { +// body.m_shapeType = m_data->m_collidablesCPU.at(collidableIndex).m_shapeType; + } else + { + // body.m_shapeType = CollisionShape::SHAPE_PLANE; + m_planeBodyIndex = m_data->m_numAcceleratedRigidBodies; + } + //body.m_shapeType = shapeType; + + + body.m_invMass = mass? 1.f/mass : 0.f; + + if (writeToGpu) + { + m_data->m_bodyBufferGPU->copyFromHostPointer(&body,1,m_data->m_numAcceleratedRigidBodies); + } + + b3InertiaData& shapeInfo = m_data->m_inertiaBufferCPU->at(m_data->m_numAcceleratedRigidBodies); + + if (mass==0.f) + { + if (m_data->m_numAcceleratedRigidBodies==0) + m_static0Index = 0; + + shapeInfo.m_initInvInertia.setValue(0,0,0,0,0,0,0,0,0); + shapeInfo.m_invInertiaWorld.setValue(0,0,0,0,0,0,0,0,0); + } else + { + + b3Assert(body.m_collidableIdx>=0); + + //approximate using the aabb of the shape + + //Aabb aabb = (*m_data->m_shapePointers)[shapeIndex]->m_aabb; + b3Vector3 halfExtents = (aabbMax-aabbMin);//*0.5f;//fake larger inertia makes demos more stable ;-) + + b3Vector3 localInertia; + + float lx=2.f*halfExtents[0]; + float ly=2.f*halfExtents[1]; + float lz=2.f*halfExtents[2]; + + localInertia.setValue( (mass/12.0f) * (ly*ly + lz*lz), + (mass/12.0f) * (lx*lx + lz*lz), + (mass/12.0f) * (lx*lx + ly*ly)); + + b3Vector3 invLocalInertia; + invLocalInertia[0] = 1.f/localInertia[0]; + invLocalInertia[1] = 1.f/localInertia[1]; + invLocalInertia[2] = 1.f/localInertia[2]; + invLocalInertia[3] = 0.f; + + shapeInfo.m_initInvInertia.setValue( + invLocalInertia[0], 0, 0, + 0, invLocalInertia[1], 0, + 0, 0, invLocalInertia[2]); + + b3Matrix3x3 m (body.m_quat); + + shapeInfo.m_invInertiaWorld = m.scaled(invLocalInertia) * m.transpose(); + + } + + if (writeToGpu) + m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo,1,m_data->m_numAcceleratedRigidBodies); + + + + return m_data->m_numAcceleratedRigidBodies++; +} + +int b3GpuNarrowPhase::getNumRigidBodies() const +{ + return m_data->m_numAcceleratedRigidBodies; +} + +void b3GpuNarrowPhase::writeAllBodiesToGpu() +{ + + if (m_data->m_localShapeAABBCPU->size()) + { + m_data->m_localShapeAABBGPU->copyFromHost(*m_data->m_localShapeAABBCPU); + } + + + m_data->m_gpuChildShapes->copyFromHost(m_data->m_cpuChildShapes); + m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces); + m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra); + m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges); + m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices); + m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices); + m_data->m_bvhInfoGPU->copyFromHost(m_data->m_bvhInfoCPU); + m_data->m_treeNodesGPU->copyFromHost(m_data->m_treeNodesCPU); + m_data->m_subTreesGPU->copyFromHost(m_data->m_subTreesCPU); + + + m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies); + m_data->m_inertiaBufferGPU->resize(m_data->m_numAcceleratedRigidBodies); + + if (m_data->m_numAcceleratedRigidBodies) + { + m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); + m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); + } + if (m_data->m_collidablesCPU.size()) + { + m_data->m_collidablesGPU->copyFromHost(m_data->m_collidablesCPU); + } + + +} + + +void b3GpuNarrowPhase::reset() +{ + m_data->m_numAcceleratedShapes = 0; + m_data->m_numAcceleratedRigidBodies = 0; + this->m_static0Index = -1; + m_data->m_uniqueEdges.resize(0); + m_data->m_convexVertices.resize(0); + m_data->m_convexPolyhedra.resize(0); + m_data->m_convexIndices.resize(0); + m_data->m_cpuChildShapes.resize(0); + m_data->m_convexFaces.resize(0); + m_data->m_collidablesCPU.resize(0); + m_data->m_localShapeAABBCPU->resize(0); + m_data->m_bvhData.resize(0); + m_data->m_treeNodesCPU.resize(0); + m_data->m_subTreesCPU.resize(0); + m_data->m_bvhInfoCPU.resize(0); + +} + + +void b3GpuNarrowPhase::readbackAllBodiesToCpu() +{ + m_data->m_bodyBufferGPU->copyToHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); +} + +void b3GpuNarrowPhase::setObjectTransformCpu(float* position, float* orientation , int bodyIndex) +{ + if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + { + m_data->m_bodyBufferCPU->at(bodyIndex).m_pos=b3MakeVector3(position[0],position[1],position[2]); + m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]); + } + else + { + b3Warning("setObjectVelocityCpu out of range.\n"); + } +} +void b3GpuNarrowPhase::setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex) +{ + if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + { + m_data->m_bodyBufferCPU->at(bodyIndex).m_linVel=b3MakeVector3(linVel[0],linVel[1],linVel[2]); + m_data->m_bodyBufferCPU->at(bodyIndex).m_angVel=b3MakeVector3(angVel[0],angVel[1],angVel[2]); + } else + { + b3Warning("setObjectVelocityCpu out of range.\n"); + } +} + +bool b3GpuNarrowPhase::getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const +{ + if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + { + position[0] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.x; + position[1] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.y; + position[2] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.z; + position[3] = 1.f;//or 1 + + orientation[0] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.x; + orientation[1] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.y; + orientation[2] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.z; + orientation[3] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.w; + return true; + } + + b3Warning("getObjectTransformFromCpu out of range.\n"); + return false; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h new file mode 100644 index 0000000000..05ff3fd09e --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h @@ -0,0 +1,109 @@ +#ifndef B3_GPU_NARROWPHASE_H +#define B3_GPU_NARROWPHASE_H + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Vector3.h" + +class b3GpuNarrowPhase +{ +protected: + + struct b3GpuNarrowPhaseInternalData* m_data; + int m_acceleratedCompanionShapeIndex; + int m_planeBodyIndex; + int m_static0Index; + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr, b3Collidable& col); + int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling); + +public: + + + + + b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config); + + virtual ~b3GpuNarrowPhase(void); + + int registerSphereShape(float radius); + int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant); + + int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes); + int registerFace(const b3Vector3& faceNormal, float faceConstant); + + int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling); + + //do they need to be merged? + + int registerConvexHullShape(b3ConvexUtility* utilPtr); + int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling); + + int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu); + void setObjectTransform(const float* position, const float* orientation , int bodyIndex); + + void writeAllBodiesToGpu(); + void reset(); + void readbackAllBodiesToCpu(); + bool getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const; + + void setObjectTransformCpu(float* position, float* orientation , int bodyIndex); + void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex); + + + virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects); + + + cl_mem getBodiesGpu(); + const struct b3RigidBodyData* getBodiesCpu() const; + //struct b3RigidBodyData* getBodiesCpu(); + + int getNumBodiesGpu() const; + + cl_mem getBodyInertiasGpu(); + int getNumBodyInertiasGpu() const; + + cl_mem getCollidablesGpu(); + const struct b3Collidable* getCollidablesCpu() const; + int getNumCollidablesGpu() const; + + const struct b3SapAabb* getLocalSpaceAabbsCpu() const; + + const struct b3Contact4* getContactsCPU() const; + + cl_mem getContactsGpu(); + int getNumContactsGpu() const; + + cl_mem getAabbLocalSpaceBufferGpu(); + + int getNumRigidBodies() const; + + int allocateCollidable(); + + int getStatic0Index() const + { + return m_static0Index; + } + b3Collidable& getCollidableCpu(int collidableIndex); + const b3Collidable& getCollidableCpu(int collidableIndex) const; + + const b3GpuNarrowPhaseInternalData* getInternalData() const + { + return m_data; + } + + b3GpuNarrowPhaseInternalData* getInternalData() + { + return m_data; + } + + const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const; +}; + +#endif //B3_GPU_NARROWPHASE_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h new file mode 100644 index 0000000000..8a7f1ea859 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h @@ -0,0 +1,95 @@ + +#ifndef B3_GPU_NARROWPHASE_INTERNAL_DATA_H +#define B3_GPU_NARROWPHASE_INTERNAL_DATA_H + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Vector3.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" + +#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h" +#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3Common/shared/b3Int2.h" + + +class b3ConvexUtility; + +struct b3GpuNarrowPhaseInternalData +{ + b3AlignedObjectArray<b3ConvexUtility*>* m_convexData; + + b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra; + b3AlignedObjectArray<b3Vector3> m_uniqueEdges; + b3AlignedObjectArray<b3Vector3> m_convexVertices; + b3AlignedObjectArray<int> m_convexIndices; + + b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU; + b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU; + b3OpenCLArray<b3Vector3>* m_convexVerticesGPU; + b3OpenCLArray<int>* m_convexIndicesGPU; + + b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU; + b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU; + b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU; + b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU; + b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU; + + b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes; + b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes; + + b3AlignedObjectArray<b3GpuFace> m_convexFaces; + b3OpenCLArray<b3GpuFace>* m_convexFacesGPU; + + struct GpuSatCollision* m_gpuSatCollision; + + + b3OpenCLArray<b3Int4>* m_triangleConvexPairs; + + + b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2]; + int m_currentContactBuffer; + b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU; + + + b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU; + b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU; + + b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU; + b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU; + + int m_numAcceleratedShapes; + int m_numAcceleratedRigidBodies; + + b3AlignedObjectArray<b3Collidable> m_collidablesCPU; + b3OpenCLArray<b3Collidable>* m_collidablesGPU; + + b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU; + b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU; + + b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData; + b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces; + + b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU; + b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU; + + b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU; + b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU; + + b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU; + b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU; + + + b3Config m_config; + +}; + +#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp new file mode 100644 index 0000000000..0d3d50c548 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp @@ -0,0 +1,1158 @@ + +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +bool useGpuInitSolverBodies = true; +bool useGpuInfo1 = true; +bool useGpuInfo2= true; +bool useGpuSolveJointConstraintRows=true; +bool useGpuWriteBackVelocities = true; +bool gpuBreakConstraints = true; + +#include "b3GpuPgsConstraintSolver.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + +#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" +#include <new> +#include "Bullet3Common/b3AlignedObjectArray.h" +#include <string.h> //for memset +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" + +#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" + +#define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl" + + +struct b3GpuPgsJacobiSolverInternalData +{ + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + b3PrefixScanCL* m_prefixScan; + + cl_kernel m_solveJointConstraintRowsKernels; + cl_kernel m_initSolverBodiesKernel; + cl_kernel m_getInfo1Kernel; + cl_kernel m_initBatchConstraintsKernel; + cl_kernel m_getInfo2Kernel; + cl_kernel m_writeBackVelocitiesKernel; + cl_kernel m_breakViolatedConstraintsKernel; + + b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets; + + b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies; + b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints; + b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows; + b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1; + +// b3AlignedObjectArray<b3GpuSolverBody> m_cpuSolverBodies; + b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints; + b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows; + b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1; + b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets; + + b3AlignedObjectArray<b3RigidBodyData> m_cpuBodies; + b3AlignedObjectArray<b3InertiaData> m_cpuInertias; + + + b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; + + b3AlignedObjectArray<int> m_batchSizes; + + +}; + + +/* +static b3Transform getWorldTransform(b3RigidBodyData* rb) +{ + b3Transform newTrans; + newTrans.setOrigin(rb->m_pos); + newTrans.setRotation(rb->m_quat); + return newTrans; +} + +static const b3Matrix3x3& getInvInertiaTensorWorld(b3InertiaData* inertia) +{ + return inertia->m_invInertiaWorld; +} + +*/ + +static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb) +{ + return rb->m_linVel; +} + +static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb) +{ + return rb->m_angVel; +} + +b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos) +{ + //we also calculate lin/ang velocity for kinematic objects + return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos); + +} + + + +b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs) +{ + m_usePgs = usePgs; + m_gpuData = new b3GpuPgsJacobiSolverInternalData(); + m_gpuData->m_context = ctx; + m_gpuData->m_device = device; + m_gpuData->m_queue = queue; + + m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx,device,queue); + + m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); + + m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); + cl_int errNum=0; + + { + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,&errNum,"",B3_JOINT_SOLVER_PATH); + //cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,0,&errNum,"",B3_JOINT_SOLVER_PATH,true); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device,solveConstraintRowsCL, "solveJointConstraintRows",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initSolverBodies",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo1Kernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initBatchConstraintsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_getInfo2Kernel= b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo2Kernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"writeBackVelocitiesKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"breakViolatedConstraintsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + + + + + clReleaseProgram(prog); + } + + +} + +b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver () +{ + clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels); + clReleaseKernel(m_gpuData->m_initSolverBodiesKernel); + clReleaseKernel(m_gpuData->m_getInfo1Kernel); + clReleaseKernel(m_gpuData->m_initBatchConstraintsKernel); + clReleaseKernel(m_gpuData->m_getInfo2Kernel); + clReleaseKernel(m_gpuData->m_writeBackVelocitiesKernel); + clReleaseKernel(m_gpuData->m_breakViolatedConstraintsKernel); + + delete m_gpuData->m_prefixScan; + delete m_gpuData->m_gpuConstraintRowOffsets; + delete m_gpuData->m_gpuSolverBodies; + delete m_gpuData->m_gpuBatchConstraints; + delete m_gpuData->m_gpuConstraintRows; + delete m_gpuData->m_gpuConstraintInfo1; + + delete m_gpuData; +} + +struct b3BatchConstraint +{ + int m_bodyAPtrAndSignBit; + int m_bodyBPtrAndSignBit; + int m_originalConstraintIndex; + int m_batchId; +}; + +static b3AlignedObjectArray<b3BatchConstraint> batchConstraints; + + +void b3GpuPgsConstraintSolver::recomputeBatches() +{ + m_gpuData->m_batchSizes.clear(); +} + + + + +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +{ + B3_PROFILE("GPU solveGroupCacheFriendlySetup"); + batchConstraints.resize(numConstraints); + m_gpuData->m_gpuBatchConstraints->resize(numConstraints); + m_staticIdx = -1; + m_maxOverrideNumSolverIterations = 0; + + + /* m_gpuData->m_gpuBodies->resize(numBodies); + m_gpuData->m_gpuBodies->copyFromHostPointer(bodies,numBodies); + + b3OpenCLArray<b3InertiaData> gpuInertias(m_gpuData->m_context,m_gpuData->m_queue); + gpuInertias.resize(numBodies); + gpuInertias.copyFromHostPointer(inertias,numBodies); + */ + + m_gpuData->m_gpuSolverBodies->resize(numBodies); + + + m_tmpSolverBodyPool.resize(numBodies); + { + + if (useGpuInitSolverBodies) + { + B3_PROFILE("m_initSolverBodiesKernel"); + + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initSolverBodiesKernel,"m_initSolverBodiesKernel"); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_gpuData->m_queue); + + // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + } else + { + gpuBodies->copyToHost(m_gpuData->m_cpuBodies); + for (int i=0;i<numBodies;i++) + { + + b3RigidBodyData& body = m_gpuData->m_cpuBodies[i]; + b3GpuSolverBody& solverBody = m_tmpSolverBodyPool[i]; + initSolverBody(i,&solverBody,&body); + solverBody.m_originalBodyIndex = i; + } + m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); + } + } + +// int totalBodies = 0; + int totalNumRows = 0; + //b3RigidBody* rb0=0,*rb1=0; + //if (1) + { + { + + + // int i; + + m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); + + // b3OpenCLArray<b3GpuGenericConstraint> gpuConstraints(m_gpuData->m_context,m_gpuData->m_queue); + + + if (useGpuInfo1) + { + B3_PROFILE("info1 and init batchConstraint"); + + m_gpuData->m_gpuConstraintInfo1->resize(numConstraints); + + + if (1) + { + B3_PROFILE("getInfo1Kernel"); + + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo1Kernel,"m_getInfo1Kernel"); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + clFinish(m_gpuData->m_queue); + } + + if (m_gpuData->m_batchSizes.size()==0) + { + B3_PROFILE("initBatchConstraintsKernel"); + + m_gpuData->m_gpuConstraintRowOffsets->resize(numConstraints); + unsigned int total=0; + m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,*m_gpuData->m_gpuConstraintRowOffsets,numConstraints,&total); + unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1); + totalNumRows = total+lastElem; + + { + B3_PROFILE("init batch constraints"); + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initBatchConstraintsKernel,"m_initBatchConstraintsKernel"); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + clFinish(m_gpuData->m_queue); + } + //assume the batching happens on CPU, so copy the data + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + } + } + else + { + totalNumRows = 0; + gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); + //calculate the total number of contraint rows + for (int i=0;i<numConstraints;i++) + { + unsigned int& info1= m_tmpConstraintSizesPool[i]; + // unsigned int info1; + if (m_gpuData->m_cpuConstraints[i].isEnabled()) + { + + m_gpuData->m_cpuConstraints[i].getInfo1(&info1,&m_gpuData->m_cpuBodies[0]); + } else + { + info1 = 0; + } + + totalNumRows += info1; + } + + m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); + m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); + + } + m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); + m_gpuData->m_gpuConstraintRows->resize(totalNumRows); + + // b3GpuConstraintArray verify; + + if (useGpuInfo2) + { + { + B3_PROFILE("getInfo2Kernel"); + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo2Kernel,"m_getInfo2Kernel"); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setBuffer(gpuInertias->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setConst(infoGlobal.m_timeStep); + launcher.setConst(infoGlobal.m_erp); + launcher.setConst(infoGlobal.m_globalCfm); + launcher.setConst(infoGlobal.m_damping); + launcher.setConst(infoGlobal.m_numIterations); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + clFinish(m_gpuData->m_queue); + + if (m_gpuData->m_batchSizes.size()==0) + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + //m_gpuData->m_gpuConstraintRows->copyToHost(verify); + //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); + + + + } + } + else + { + + gpuInertias->copyToHost(m_gpuData->m_cpuInertias); + + ///setup the b3SolverConstraints + + for (int i=0;i<numConstraints;i++) + { + const int& info1 = m_tmpConstraintSizesPool[i]; + + if (info1) + { + int constraintIndex = batchConstraints[i].m_originalConstraintIndex; + int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[constraintIndex]; + + b3GpuSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[constraintRowOffset]; + b3GpuGenericConstraint& constraint = m_gpuData->m_cpuConstraints[i]; + + b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[ constraint.getRigidBodyA()]; + //b3RigidBody& rbA = constraint.getRigidBodyA(); + // b3RigidBody& rbB = constraint.getRigidBodyB(); + b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[ constraint.getRigidBodyB()]; + + + + int solverBodyIdA = constraint.getRigidBodyA();//getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias); + int solverBodyIdB = constraint.getRigidBodyB();//getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias); + + b3GpuSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; + b3GpuSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; + + if (rbA.m_invMass) + { + batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; + } else + { + if (!solverBodyIdA) + m_staticIdx = 0; + batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA; + } + + if (rbB.m_invMass) + { + batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; + } else + { + if (!solverBodyIdB) + m_staticIdx = 0; + batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; + } + + + int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; + if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + m_maxOverrideNumSolverIterations = overrideNumSolverIterations; + + + int j; + for ( j=0;j<info1;j++) + { + memset(¤tConstraintRow[j],0,sizeof(b3GpuSolverConstraint)); + currentConstraintRow[j].m_angularComponentA.setValue(0,0,0); + currentConstraintRow[j].m_angularComponentB.setValue(0,0,0); + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_cfm = 0.f; + currentConstraintRow[j].m_contactNormal.setValue(0,0,0); + currentConstraintRow[j].m_friction = 0.f; + currentConstraintRow[j].m_frictionIndex = 0; + currentConstraintRow[j].m_jacDiagABInv = 0.f; + currentConstraintRow[j].m_lowerLimit = 0.f; + currentConstraintRow[j].m_upperLimit = 0.f; + + currentConstraintRow[j].m_originalContactPoint = 0; + currentConstraintRow[j].m_overrideNumSolverIterations = 0; + currentConstraintRow[j].m_relpos1CrossNormal.setValue(0,0,0); + currentConstraintRow[j].m_relpos2CrossNormal.setValue(0,0,0); + currentConstraintRow[j].m_rhs = 0.f; + currentConstraintRow[j].m_rhsPenetration = 0.f; + currentConstraintRow[j].m_solverBodyIdA = 0; + currentConstraintRow[j].m_solverBodyIdB = 0; + + currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; + currentConstraintRow[j].m_upperLimit = B3_INFINITY; + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA; + currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB; + currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; + } + + bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); + bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); + bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); + bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); + bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); + bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); + bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + + + b3GpuConstraintInfo2 info2; + info2.fps = 1.f/infoGlobal.m_timeStep; + info2.erp = infoGlobal.m_erp; + info2.m_J1linearAxis = currentConstraintRow->m_contactNormal; + info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; + info2.m_J2linearAxis = 0; + info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; + info2.rowskip = sizeof(b3GpuSolverConstraint)/sizeof(b3Scalar);//check this + ///the size of b3GpuSolverConstraint needs be a multiple of b3Scalar + b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3GpuSolverConstraint)); + info2.m_constraintError = ¤tConstraintRow->m_rhs; + currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; + info2.m_damping = infoGlobal.m_damping; + info2.cfm = ¤tConstraintRow->m_cfm; + info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; + info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; + info2.m_numIterations = infoGlobal.m_numIterations; + m_gpuData->m_cpuConstraints[i].getInfo2(&info2,&m_gpuData->m_cpuBodies[0]); + + ///finalize the constraint setup + for ( j=0;j<info1;j++) + { + b3GpuSolverConstraint& solverConstraint = currentConstraintRow[j]; + + if (solverConstraint.m_upperLimit>=m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + { + solverConstraint.m_upperLimit = m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); + } + + if (solverConstraint.m_lowerLimit<=-m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + { + solverConstraint.m_lowerLimit = -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); + } + + // solverConstraint.m_originalContactPoint = constraint; + + b3Matrix3x3& invInertiaWorldA= m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld; + { + + //b3Vector3 angularFactorA(1,1,1); + const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; + solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA; + } + + b3Matrix3x3& invInertiaWorldB= m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld; + { + + const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; + solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint.getRigidBodyB().getAngularFactor(); + } + + { + //it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal + //because it gets multiplied iMJlB + b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.m_invMass; + b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal; + b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.m_invMass;//sign of normal? + b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal; + + b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal); + sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); + sum += iMJlB.dot(solverConstraint.m_contactNormal); + sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); + b3Scalar fsum = b3Fabs(sum); + b3Assert(fsum > B3_EPSILON); + solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f; + } + + + ///fix rhs + ///todo: add force/torque accelerators + { + b3Scalar rel_vel; + b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel); + b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel); + + rel_vel = vel1Dotn+vel2Dotn; + + b3Scalar restitution = 0.f; + b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 + b3Scalar velocityError = restitution - rel_vel * info2.m_damping; + b3Scalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; + b3Scalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; + solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; + solverConstraint.m_appliedImpulse = 0.f; + + } + } + + } + } + + + + m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); + m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); + + if (m_gpuData->m_batchSizes.size()==0) + m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); + else + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + + m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); + + + + }//end useGpuInfo2 + + + } + +#ifdef B3_SUPPORT_CONTACT_CONSTRAINTS + { + int i; + + for (i=0;i<numManifolds;i++) + { + b3Contact4& manifold = manifoldPtr[i]; + convertContact(bodies,inertias,&manifold,infoGlobal); + } + } +#endif //B3_SUPPORT_CONTACT_CONSTRAINTS + } + +// b3ContactSolverInfo info = infoGlobal; + + +// int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); +// int numConstraintPool = m_tmpSolverContactConstraintPool.size(); +// int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); + + + return 0.f; + +} + + + +///a straight copy from GPU/OpenCL kernel, for debugging +__inline void internalApplyImpulse( b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude) +{ + body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); +} + + +void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c) +{ + float deltaImpulse = c->m_rhs-b3Scalar(c->m_appliedImpulse)*c->m_cfm; + float deltaVel1Dotn = b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + + deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + + float sum = b3Scalar(c->m_appliedImpulse) + deltaImpulse; + if (sum < c->m_lowerLimit) + { + deltaImpulse = c->m_lowerLimit-b3Scalar(c->m_appliedImpulse); + c->m_appliedImpulse = c->m_lowerLimit; + } + else if (sum > c->m_upperLimit) + { + deltaImpulse = c->m_upperLimit-b3Scalar(c->m_appliedImpulse); + c->m_appliedImpulse = c->m_upperLimit; + } + else + { + c->m_appliedImpulse = sum; + } + + internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); + internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); + +} + + + +void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) +{ + + solverBody->m_deltaLinearVelocity.setValue(0.f,0.f,0.f); + solverBody->m_deltaAngularVelocity.setValue(0.f,0.f,0.f); + solverBody->internalGetPushVelocity().setValue(0.f,0.f,0.f); + solverBody->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + + b3Assert(rb); +// solverBody->m_worldTransform = getWorldTransform(rb); + solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass,rb->m_invMass,rb->m_invMass)); + solverBody->m_originalBodyIndex = bodyIndex; + solverBody->m_angularFactor = b3MakeVector3(1,1,1); + solverBody->m_linearFactor = b3MakeVector3(1,1,1); + solverBody->m_linearVelocity = getLinearVelocity(rb); + solverBody->m_angularVelocity = getAngularVelocity(rb); +} + + +void b3GpuPgsConstraintSolver::averageVelocities() +{ +} + + +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal) +{ + //only create the batches once. + //@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated + B3_PROFILE("GpuSolveGroupCacheFriendlyIterations"); + + bool createBatches = m_gpuData->m_batchSizes.size()==0; + { + + if (createBatches) + { + + m_gpuData->m_batchSizes.resize(0); + + { + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + + B3_PROFILE("batch joints"); + b3Assert(batchConstraints.size()==numConstraints); + int simdWidth =numConstraints+1; + int numBodies = m_tmpSolverBodyPool.size(); + sortConstraintByBatch3( &batchConstraints[0], numConstraints, simdWidth , m_staticIdx, numBodies); + + m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); + + } + } else + { + /*b3AlignedObjectArray<b3BatchConstraint> cpuCheckBatches; + m_gpuData->m_gpuBatchConstraints->copyToHost(cpuCheckBatches); + b3Assert(cpuCheckBatches.size()==batchConstraints.size()); + printf(".\n"); + */ + //>copyFromHost(batchConstraints); + } + int maxIterations = infoGlobal.m_numIterations; + + bool useBatching = true; + + if (useBatching ) + { + + if (!useGpuSolveJointConstraintRows) + { + B3_PROFILE("copy to host"); + m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); + m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); + m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); + gpuConstraints1->copyToHost(m_gpuData->m_cpuConstraints); + + } + + for ( int iteration = 0 ; iteration< maxIterations ; iteration++) + { + + int batchOffset = 0; + int constraintOffset=0; + int numBatches = m_gpuData->m_batchSizes.size(); + for (int bb=0;bb<numBatches;bb++) + { + int numConstraintsInBatch = m_gpuData->m_batchSizes[bb]; + + + if (useGpuSolveJointConstraintRows) + { + B3_PROFILE("solveJointConstraintRowsKernels"); + + /* + __kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, + __global b3BatchConstraint* batchConstraints, + __global b3SolverConstraint* rows, + __global unsigned int* numConstraintRowsInfo1, + __global unsigned int* rowOffsets, + __global b3GpuGenericConstraint* constraints, + int batchOffset, + int numConstraintsInBatch*/ + + + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels,"m_solveJointConstraintRowsKernels"); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(gpuConstraints1->getBufferCL());//to detect disabled constraints + launcher.setConst(batchOffset); + launcher.setConst(numConstraintsInBatch); + + launcher.launch1D(numConstraintsInBatch); + + + } else//useGpu + { + + + + for (int b=0;b<numConstraintsInBatch;b++) + { + const b3BatchConstraint& c = batchConstraints[batchOffset+b]; + /*printf("-----------\n"); + printf("bb=%d\n",bb); + printf("c.batchId = %d\n", c.m_batchId); + */ + b3Assert(c.m_batchId==bb); + b3GpuGenericConstraint* constraint = &m_gpuData->m_cpuConstraints[c.m_originalConstraintIndex]; + if (constraint->m_flags&B3_CONSTRAINT_FLAG_ENABLED) + { + int numConstraintRows = m_gpuData->m_cpuConstraintInfo1[c.m_originalConstraintIndex]; + int constraintOffset = m_gpuData->m_cpuConstraintRowOffsets[c.m_originalConstraintIndex]; + + for (int jj=0;jj<numConstraintRows;jj++) + { + // + b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset+jj]; + //resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + } + } + } + }//useGpu + batchOffset+=numConstraintsInBatch; + constraintOffset+=numConstraintsInBatch; + } + }//for (int iteration... + + if (!useGpuSolveJointConstraintRows) + { + { + B3_PROFILE("copy from host"); + m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); + m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); + m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); + } + + //B3_PROFILE("copy to host"); + //m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + } + //int sz = sizeof(b3GpuSolverBody); + //printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz); + + + + + + } else + { + for ( int iteration = 0 ; iteration< maxIterations ; iteration++) + { + int numJoints = m_tmpSolverNonContactConstraintPool.size(); + for (int j=0;j<numJoints;j++) + { + b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[j]; + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + } + + if (!m_usePgs) + { + averageVelocities(); + } + } + } + + } + clFinish(m_gpuData->m_queue); + return 0.f; +} + + + + +static b3AlignedObjectArray<int> bodyUsed; +static b3AlignedObjectArray<int> curUsed; + + + +inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3( b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) +{ + //int sz = sizeof(b3BatchConstraint); + + B3_PROFILE("sortConstraintByBatch3"); + + static int maxSwaps = 0; + int numSwaps = 0; + + curUsed.resize(2*simdWidth); + + static int maxNumConstraints = 0; + if (maxNumConstraints<numConstraints) + { + maxNumConstraints = numConstraints; + //printf("maxNumConstraints = %d\n",maxNumConstraints ); + } + + int numUsedArray = numBodies/32+1; + bodyUsed.resize(numUsedArray); + + for (int q=0;q<numUsedArray;q++) + bodyUsed[q]=0; + + + int curBodyUsed = 0; + + int numIter = 0; + + +#if defined(_DEBUG) + for(int i=0; i<numConstraints; i++) + cs[i].m_batchId = -1; +#endif + + int numValidConstraints = 0; +// int unprocessedConstraintIndex = 0; + + int batchIdx = 0; + + + { + B3_PROFILE("cpu batch innerloop"); + + while( numValidConstraints < numConstraints) + { + numIter++; + int nCurrentBatch = 0; + // clear flag + for(int i=0; i<curBodyUsed; i++) + bodyUsed[curUsed[i]/32] = 0; + + curBodyUsed = 0; + + for(int i=numValidConstraints; i<numConstraints; i++) + { + int idx = i; + b3Assert( idx < numConstraints ); + // check if it can go + int bodyAS = cs[idx].m_bodyAPtrAndSignBit; + int bodyBS = cs[idx].m_bodyBPtrAndSignBit; + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + int aUnavailable = 0; + int bUnavailable = 0; + if (!aIsStatic) + { + aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31)); + } + if (!aUnavailable) + if (!bIsStatic) + { + bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31)); + } + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + { + bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31)); + curUsed[curBodyUsed++]=bodyA; + } + if (!bIsStatic) + { + bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31)); + curUsed[curBodyUsed++]=bodyB; + } + + cs[idx].m_batchId = batchIdx; + + if (i!=numValidConstraints) + { + b3Swap(cs[i],cs[numValidConstraints]); + numSwaps++; + } + + numValidConstraints++; + { + nCurrentBatch++; + if( nCurrentBatch == simdWidth ) + { + nCurrentBatch = 0; + for(int i=0; i<curBodyUsed; i++) + bodyUsed[curUsed[i]/32] = 0; + curBodyUsed = 0; + } + } + } + } + m_gpuData->m_batchSizes.push_back(nCurrentBatch); + batchIdx ++; + } + } + +#if defined(_DEBUG) + // debugPrintf( "nBatches: %d\n", batchIdx ); + for(int i=0; i<numConstraints; i++) + { + b3Assert( cs[i].m_batchId != -1 ); + } +#endif + + if (maxSwaps<numSwaps) + { + maxSwaps = numSwaps; + //printf("maxSwaps = %d\n", maxSwaps); + } + + return batchIdx; +} + + +/// b3PgsJacobiSolver Sequentially applies impulses +b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints, const b3ContactSolverInfo& infoGlobal) +{ + + B3_PROFILE("solveJoints"); + //you need to provide at least some bodies + + solveGroupCacheFriendlySetup( gpuBodies, gpuInertias,numBodies,gpuConstraints, numConstraints,infoGlobal); + + solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints,infoGlobal); + + solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias,numBodies, gpuConstraints, numConstraints, infoGlobal); + + return 0.f; +} + +void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) +{ + b3ContactSolverInfo infoGlobal; + infoGlobal.m_splitImpulse = false; + infoGlobal.m_timeStep = 1.f/60.f; + infoGlobal.m_numIterations = 4;//4; +// infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION; + //infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS; + infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS; + + //if (infoGlobal.m_solverMode & B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) + //if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION)) + + + solveGroup(gpuBodies,gpuInertias,numBodies,gpuConstraints,numConstraints,infoGlobal); + +} + +//b3AlignedObjectArray<b3RigidBodyData> testBodies; + + +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +{ + B3_PROFILE("solveGroupCacheFriendlyFinish"); +// int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); +// int i,j; + + + { + if (gpuBreakConstraints) + { + B3_PROFILE("breakViolatedConstraintsKernel"); + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_breakViolatedConstraintsKernel,"m_breakViolatedConstraintsKernel"); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + } else + { + gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); + m_gpuData->m_gpuBatchConstraints->copyToHost(m_gpuData->m_cpuBatchConstraints); + m_gpuData->m_gpuConstraintRows->copyToHost(m_gpuData->m_cpuConstraintRows); + gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); + m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); + m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); + + for (int cid=0;cid<numConstraints;cid++) + { + int originalConstraintIndex = batchConstraints[cid].m_originalConstraintIndex; + int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[originalConstraintIndex]; + int numRows = m_gpuData->m_cpuConstraintInfo1[originalConstraintIndex]; + if (numRows) + { + + // printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold); + for (int i=0;i<numRows;i++) + { + int rowIndex =constraintRowOffset+i; + int orgConstraintIndex = m_gpuData->m_cpuConstraintRows[rowIndex].m_originalConstraintIndex; + float breakingThreshold = m_gpuData->m_cpuConstraints[orgConstraintIndex].m_breakingImpulseThreshold; + // printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse); + if (b3Fabs(m_gpuData->m_cpuConstraintRows[rowIndex].m_appliedImpulse) >= breakingThreshold) + { + + m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED; + } + } + } + } + + + gpuConstraints->copyFromHost(m_gpuData->m_cpuConstraints); + } + } + + { + if (useGpuWriteBackVelocities) + { + B3_PROFILE("GPU write back velocities and transforms"); + + b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_writeBackVelocitiesKernel,"m_writeBackVelocitiesKernel"); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_gpuData->m_queue); +// m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); +// m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); + //m_gpuData->m_gpuBodies->copyToHost(testBodies); + + } + else + { + B3_PROFILE("CPU write back velocities and transforms"); + + m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + gpuBodies->copyToHost(m_gpuData->m_cpuBodies); + for ( int i=0;i<m_tmpSolverBodyPool.size();i++) + { + int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex; + //printf("bodyIndex=%d\n",bodyIndex); + b3Assert(i==bodyIndex); + + b3RigidBodyData* body = &m_gpuData->m_cpuBodies[bodyIndex]; + if (body->m_invMass) + { + if (infoGlobal.m_splitImpulse) + m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp); + else + m_tmpSolverBodyPool[i].writebackVelocity(); + + if (m_usePgs) + { + body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity; + body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity; + } else + { + b3Assert(0); + } + /* + if (infoGlobal.m_splitImpulse) + { + body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin(); + b3Quaternion orn; + orn = m_tmpSolverBodyPool[i].m_worldTransform.getRotation(); + body->m_quat = orn; + } + */ + } + }//for + + gpuBodies->copyFromHost(m_gpuData->m_cpuBodies); + + } + } + + clFinish(m_gpuData->m_queue); + + m_tmpSolverContactConstraintPool.resizeNoInitialize(0); + m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0); + m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0); + m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0); + + m_tmpSolverBodyPool.resizeNoInitialize(0); + return 0.f; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h new file mode 100644 index 0000000000..ec0e3f73d6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h @@ -0,0 +1,78 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#ifndef B3_GPU_PGS_CONSTRAINT_SOLVER_H +#define B3_GPU_PGS_CONSTRAINT_SOLVER_H + +struct b3Contact4; +struct b3ContactPoint; + + +class b3Dispatcher; + +#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" +#include "Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h" +#include "b3GpuSolverBody.h" +#include "b3GpuSolverConstraint.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +struct b3RigidBodyData; +struct b3InertiaData; + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "b3GpuGenericConstraint.h" + +class b3GpuPgsConstraintSolver +{ +protected: + int m_staticIdx; + struct b3GpuPgsJacobiSolverInternalData* m_gpuData; + protected: + b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool; + b3GpuConstraintArray m_tmpSolverContactConstraintPool; + b3GpuConstraintArray m_tmpSolverNonContactConstraintPool; + b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool; + b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool; + + b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool; + + + bool m_usePgs; + void averageVelocities(); + + int m_maxOverrideNumSolverIterations; + + int m_numSplitImpulseRecoveries; + +// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias); + void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb); + +public: + b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs); + virtual~b3GpuPgsConstraintSolver (); + + virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal); + virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); + b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); + + + b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); + void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints); + + int sortConstraintByBatch3( struct b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies); + void recomputeBatches(); +}; + +#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp new file mode 100644 index 0000000000..f0b0abd5e0 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp @@ -0,0 +1,1708 @@ + +bool gUseLargeBatches = false; +bool gCpuBatchContacts = false; +bool gCpuSolveConstraint = false; +bool gCpuRadixSort=false; +bool gCpuSetSortData = false; +bool gCpuSortContactsDeterminism = false; +bool gUseCpuCopyConstraints = false; +bool gUseScanHost = false; +bool gReorderContactsOnCpu = false; + +bool optionalSortContactsDeterminism = true; + + +#include "b3GpuPgsContactSolver.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" +#include <string.h> +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" +#include "b3Solver.h" + + +#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl" +#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl" +#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl" +#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl" +#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl" +#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl" + +#include "kernels/solverSetup.h" +#include "kernels/solverSetup2.h" +#include "kernels/solveContact.h" +#include "kernels/solveFriction.h" +#include "kernels/batchingKernels.h" +#include "kernels/batchingKernelsNew.h" + + + + + +struct b3GpuBatchingPgsSolverInternalData +{ + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + int m_pairCapacity; + int m_nIterations; + + b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU; + b3OpenCLArray<unsigned int>* m_numConstraints; + b3OpenCLArray<unsigned int>* m_offsets; + + b3Solver* m_solverGPU; + + cl_kernel m_batchingKernel; + cl_kernel m_batchingKernelNew; + cl_kernel m_solveContactKernel; + cl_kernel m_solveSingleContactKernel; + cl_kernel m_solveSingleFrictionKernel; + cl_kernel m_solveFrictionKernel; + cl_kernel m_contactToConstraintKernel; + cl_kernel m_setSortDataKernel; + cl_kernel m_reorderContactKernel; + cl_kernel m_copyConstraintKernel; + + cl_kernel m_setDeterminismSortDataBodyAKernel; + cl_kernel m_setDeterminismSortDataBodyBKernel; + cl_kernel m_setDeterminismSortDataChildShapeAKernel; + cl_kernel m_setDeterminismSortDataChildShapeBKernel; + + + + + class b3RadixSort32CL* m_sort32; + class b3BoundSearchCL* m_search; + class b3PrefixScanCL* m_scan; + + b3OpenCLArray<b3SortData>* m_sortDataBuffer; + b3OpenCLArray<b3Contact4>* m_contactBuffer; + + b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU; + b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU; + b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU; + + b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy; + b3OpenCLArray<b3SortData>* m_contactKeyValues; + + + b3AlignedObjectArray<unsigned int> m_idxBuffer; + b3AlignedObjectArray<b3SortData> m_sortData; + b3AlignedObjectArray<b3Contact4> m_old; + + b3AlignedObjectArray<int> m_batchSizes; + b3OpenCLArray<int>* m_batchSizesGpu; + +}; + + + +b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity) +{ + m_debugOutput=0; + m_data = new b3GpuBatchingPgsSolverInternalData; + m_data->m_context = ctx; + m_data->m_device = device; + m_data->m_queue = q; + m_data->m_pairCapacity = pairCapacity; + m_data->m_nIterations = 4; + m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx,q); + m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,q); + m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,q); + m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx,q); + + m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx,q); + m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx,q); + + + m_data->m_solverGPU = new b3Solver(ctx,device,q,512*1024); + + m_data->m_sort32 = new b3RadixSort32CL(ctx,device,m_data->m_queue); + m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); + + const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 ); + + m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,m_data->m_queue,sortSize); + m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx,m_data->m_queue); + + m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS); + + m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx,q,pairCapacity); + + m_data->m_offsets = new b3OpenCLArray<unsigned int>( ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_offsets->resize(B3_SOLVER_N_CELLS); + const char* additionalMacros = ""; + //const char* srcFileNameForCaching=""; + + + + cl_int pErrNum; + const char* batchKernelSource = batchingKernelsCL; + const char* batchKernelNewSource = batchingKernelsNewCL; + const char* solverSetupSource = solverSetupCL; + const char* solverSetup2Source = solverSetup2CL; + const char* solveContactSource = solveContactCL; + const char* solveFrictionSource = solveFrictionCL; + + + { + + cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + b3Assert(solveContactProg); + + cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); + b3Assert(solveFrictionProg); + + cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH); + + + b3Assert(solverSetup2Prog); + + + cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); + b3Assert(solverSetupProg); + + + m_data->m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + b3Assert(m_data->m_solveFrictionKernel); + + m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + b3Assert(m_data->m_solveContactKernel); + + m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg,additionalMacros ); + b3Assert(m_data->m_solveSingleContactKernel); + + m_data->m_solveSingleFrictionKernel =b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg,additionalMacros ); + b3Assert(m_data->m_solveSingleFrictionKernel); + + m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + b3Assert(m_data->m_contactToConstraintKernel); + + m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setSortDataKernel); + + m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataBodyAKernel); + + m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataBodyBKernel); + + m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel); + + m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel); + + + m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_reorderContactKernel); + + + m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_copyConstraintKernel); + + } + + { + cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH); + b3Assert(batchingProg); + + m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + b3Assert(m_data->m_batchingKernel); + } + + { + cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH); + b3Assert(batchingNewProg); + + m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); + b3Assert(m_data->m_batchingKernelNew); + } + + + + + + + +} + +b3GpuPgsContactSolver::~b3GpuPgsContactSolver() +{ + delete m_data->m_batchSizesGpu; + delete m_data->m_bodyBufferGPU; + delete m_data->m_inertiaBufferGPU; + delete m_data->m_pBufContactOutGPU; + delete m_data->m_pBufContactOutGPUCopy; + delete m_data->m_contactKeyValues; + + + + delete m_data->m_contactCGPU; + delete m_data->m_numConstraints; + delete m_data->m_offsets; + delete m_data->m_sortDataBuffer; + delete m_data->m_contactBuffer; + + delete m_data->m_sort32; + delete m_data->m_scan; + delete m_data->m_search; + delete m_data->m_solverGPU; + + clReleaseKernel(m_data->m_batchingKernel); + clReleaseKernel(m_data->m_batchingKernelNew); + clReleaseKernel(m_data->m_solveSingleContactKernel); + clReleaseKernel(m_data->m_solveSingleFrictionKernel); + clReleaseKernel( m_data->m_solveContactKernel); + clReleaseKernel( m_data->m_solveFrictionKernel); + + clReleaseKernel( m_data->m_contactToConstraintKernel); + clReleaseKernel( m_data->m_setSortDataKernel); + clReleaseKernel( m_data->m_reorderContactKernel); + clReleaseKernel( m_data->m_copyConstraintKernel); + + clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel); + clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel); + clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel); + clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel); + + + + delete m_data; +} + + + +struct b3ConstraintCfg +{ + b3ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(0) {} + + float m_positionDrift; + float m_positionConstraintCoeff; + float m_dt; + bool m_enableParallelSolve; + float m_batchCellSize; + int m_staticIdx; +}; + + + +void b3GpuPgsContactSolver::solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//const b3OpenCLArray<int>* gpuBatchSizes) +{ + B3_PROFILE("solveContactConstraintBatchSizes"); + int numBatches = batchSizes->size()/B3_MAX_NUM_BATCHES; + for(int iter=0; iter<numIterations; iter++) + { + + for (int cellId=0;cellId<numBatches;cellId++) + { + int offset = 0; + for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + { + int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii); + if (!numInBatch) + break; + + { + b3LauncherCL launcher( m_data->m_queue, m_data->m_solveSingleContactKernel,"m_solveSingleContactKernel" ); + launcher.setBuffer(bodyBuf->getBufferCL() ); + launcher.setBuffer(shapeBuf->getBufferCL() ); + launcher.setBuffer( constraint->getBufferCL() ); + launcher.setConst(cellId); + launcher.setConst(offset); + launcher.setConst(numInBatch); + launcher.launch1D(numInBatch); + offset+=numInBatch; + } + } + } + } + + + for(int iter=0; iter<numIterations; iter++) + { + for (int cellId=0;cellId<numBatches;cellId++) + { + int offset = 0; + for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + { + int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii); + if (!numInBatch) + break; + + { + b3LauncherCL launcher( m_data->m_queue, m_data->m_solveSingleFrictionKernel,"m_solveSingleFrictionKernel" ); + launcher.setBuffer(bodyBuf->getBufferCL() ); + launcher.setBuffer(shapeBuf->getBufferCL() ); + launcher.setBuffer( constraint->getBufferCL() ); + launcher.setConst(cellId); + launcher.setConst(offset); + launcher.setConst(numInBatch); + launcher.launch1D(numInBatch); + offset+=numInBatch; + } + } + } + } +} + +void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//,const b3OpenCLArray<int>* gpuBatchSizes) +{ + + //sort the contacts + + + b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); + { + + const int nn = B3_SOLVER_N_CELLS; + + cdata.x = 0; + cdata.y = maxNumBatches;//250; + + + int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; +#ifdef DEBUG_ME + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); +#endif + + + + { + + B3_PROFILE("m_batchSolveKernel iterations"); + for(int iter=0; iter<numIterations; iter++) + { + for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + { +#ifdef DEBUG_ME + memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); + gpuDebugInfo.write(debugInfo,numWorkItems); +#endif + + + cdata.z = ib; + + + b3LauncherCL launcher( m_data->m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" ); +#if 1 + + b3BufferInfoCL bInfo[] = { + + b3BufferInfoCL( bodyBuf->getBufferCL() ), + b3BufferInfoCL( shapeBuf->getBufferCL() ), + b3BufferInfoCL( constraint->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + , b3BufferInfoCL(&gpuDebugInfo) +#endif + }; + + + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL()); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); + launcher.launch1D( numWorkItems, 64 ); + + +#else + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D( num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } + +#endif + + +#ifdef DEBUG_ME + clFinish(m_queue); + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + for (int i=0;i<numWorkItems;i++) + { + if (debugInfo[i].m_valInt2>0) + { + printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + } + } +#endif //DEBUG_ME + + + } + } + + clFinish(m_data->m_queue); + + + } + + cdata.x = 1; + bool applyFriction=true; + if (applyFriction) + { + B3_PROFILE("m_batchSolveKernel iterations2"); + for(int iter=0; iter<numIterations; iter++) + { + for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + { + cdata.z = ib; + + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( bodyBuf->getBufferCL() ), + b3BufferInfoCL( shapeBuf->getBufferCL() ), + b3BufferInfoCL( constraint->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + ,b3BufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME + }; + b3LauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL()); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); + + launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); + } + } + clFinish(m_data->m_queue); + + } +#ifdef DEBUG_ME + delete[] debugInfo; +#endif //DEBUG_ME + } + + +} + + + + + + + + + + + +static bool sortfnc(const b3SortData& a,const b3SortData& b) +{ + return (a.m_key<b.m_key); +} + +static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q) +{ + return ((p.m_bodyAPtrAndSignBit<q.m_bodyAPtrAndSignBit) || + ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit<q.m_bodyBPtrAndSignBit)) || + ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA<q.m_childIndexA ) || + ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA<q.m_childIndexA ) || + ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA==q.m_childIndexA && p.m_childIndexB<q.m_childIndexB) + ); +} + + + + + + + + + + + +#define USE_SPATIAL_BATCHING 1 +#define USE_4x4_GRID 1 + +#ifndef USE_SPATIAL_BATCHING +static const int gridTable4x4[] = +{ + 0,1,17,16, + 1,2,18,19, + 17,18,32,3, + 16,19,3,34 +}; +static const int gridTable8x8[] = +{ + 0, 2, 3, 16, 17, 18, 19, 1, + 66, 64, 80, 67, 82, 81, 65, 83, + 131,144,128,130,147,129,145,146, + 208,195,194,192,193,211,210,209, + 21, 22, 23, 5, 4, 6, 7, 20, + 86, 85, 69, 87, 70, 68, 84, 71, + 151,133,149,150,135,148,132,134, + 197,27,214,213,212,199,198,196 + +}; + + +#endif + + +void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts,float scale,const b3Int4& nSplit,int staticIdx) +{ + for (int gIdx=0;gIdx<nContacts;gIdx++) + { + if( gIdx < nContacts ) + { + int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; + int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; + + int aIdx = abs(aPtrAndSignBit ); + int bIdx = abs(bPtrAndSignBit); + + bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); + + #if USE_SPATIAL_BATCHING + int idx = (aStatic)? bIdx: aIdx; + b3Vector3 p = gBodies[idx].m_pos; + int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); + int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); + int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); + + int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); + + #else//USE_SPATIAL_BATCHING + bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx); + + #if USE_4x4_GRID + int aa = aIdx&3; + int bb = bIdx&3; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*4; + int newIndex = gridTable4x4[gridIndex]; + #else//USE_4x4_GRID + int aa = aIdx&7; + int bb = bIdx&7; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*8; + int newIndex = gridTable8x8[gridIndex]; + #endif//USE_4x4_GRID + #endif//USE_SPATIAL_BATCHING + + + gSortDataOut[gIdx].x = newIndex; + gSortDataOut[gIdx].y = gIdx; + } + else + { + gSortDataOut[gIdx].x = 0xffffffff; + } + } +} + + + + + + +void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index) +{ + B3_PROFILE("solveContacts"); + m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies); + m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies); + m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts); + + if (optionalSortContactsDeterminism) + { + if (!gCpuSortContactsDeterminism) + { + B3_PROFILE("GPU Sort contact constraints (determinism)"); + + m_data->m_pBufContactOutGPUCopy->resize(numContacts); + m_data->m_contactKeyValues->resize(numContacts); + + m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0); + + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel,"m_setDeterminismSortDataChildShapeBKernel"); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel,"m_setDeterminismSortDataChildShapeAKernel"); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel,"m_setDeterminismSortDataBodyBKernel"); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel,"m_setDeterminismSortDataBodyAKernel"); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + + { + B3_PROFILE("gpu reorderContactKernel (determinism)"); + + b3Int4 cdata; + cdata.x = numContacts; + + //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) + // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst( cdata ); + launcher.launch1D( numContacts, 64 ); + } + + } else + { + B3_PROFILE("CPU Sort contact constraints (determinism)"); + b3AlignedObjectArray<b3Contact4> cpuConstraints; + m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints); + bool sort = true; + if (sort) + { + cpuConstraints.quickSort(b3ContactCmp); + + for (int i=0;i<cpuConstraints.size();i++) + { + cpuConstraints[i].m_batchIdx = i; + } + } + m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints); + if (m_debugOutput==100) + { + for (int i=0;i<cpuConstraints.size();i++) + { + printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n",i,cpuConstraints[i].m_bodyAPtrAndSignBit,cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx); + } + } + + m_debugOutput++; + } + } + + + + + int nContactOut = m_data->m_pBufContactOutGPU->size(); + + bool useSolver = true; + + + if (useSolver) + { + float dt=1./60.; + b3ConstraintCfg csCfg( dt ); + csCfg.m_enableParallelSolve = true; + csCfg.m_batchCellSize = 6; + csCfg.m_staticIdx = static0Index; + + + b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU; + + void* additionalData = 0;//m_data->m_frictionCGPU; + const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU; + b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU; + int nContacts = nContactOut; + + + int maxNumBatches = 0; + + if (!gUseLargeBatches) + { + + if( m_data->m_solverGPU->m_contactBuffer2) + { + m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); + } + + if( m_data->m_solverGPU->m_contactBuffer2 == 0 ) + { + m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context,m_data->m_queue, nContacts ); + m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); + } + + //clFinish(m_data->m_queue); + + + + { + B3_PROFILE("batching"); + //@todo: just reserve it, without copy of original contact (unless we use warmstarting) + + + + //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf; + + + { + + //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf ); + //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn ); + + const int sortAlignment = 512; // todo. get this out of sort + if( csCfg.m_enableParallelSolve ) + { + + + int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment ); + + b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; + b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; + + + if (!gCpuSetSortData) + { // 2. set cell idx + B3_PROFILE("GPU set cell idx"); + struct CB + { + int m_nContacts; + int m_staticIdx; + float m_scale; + b3Int4 m_nSplit; + }; + + b3Assert( sortSize%64 == 0 ); + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_staticIdx = csCfg.m_staticIdx; + cdata.m_scale = 1.f/csCfg.m_batchCellSize; + cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X; + cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y; + cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z; + + m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); + + + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel,"m_setSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata.m_nContacts ); + launcher.setConst( cdata.m_scale ); + launcher.setConst(cdata.m_nSplit); + launcher.setConst(cdata.m_staticIdx); + + + launcher.launch1D( sortSize, 64 ); + } else + { + m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); + b3AlignedObjectArray<b3SortData> sortDataCPU; + m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU); + + b3AlignedObjectArray<b3Contact4> contactCPU; + m_data->m_pBufContactOutGPU->copyToHost(contactCPU); + b3AlignedObjectArray<b3RigidBodyData> bodiesCPU; + bodyBuf->copyToHost(bodiesCPU); + float scale = 1.f/csCfg.m_batchCellSize; + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx); + + + m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU); + } + + + + if (!gCpuRadixSort) + { // 3. sort by cell idx + B3_PROFILE("gpuRadixSort"); + //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; + //int sortBit = 32; + //if( n <= 0xffff ) sortBit = 16; + //if( n <= 0xff ) sortBit = 8; + //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize ); + //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); + b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); + this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut); + + + + } else + { + b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); + b3AlignedObjectArray<b3SortData> hostValues; + keyValuesInOut.copyToHost(hostValues); + hostValues.quickSort(sortfnc); + keyValuesInOut.copyFromHost(hostValues); + } + + + if (gUseScanHost) + { + // 4. find entries + B3_PROFILE("cpuBoundSearch"); + b3AlignedObjectArray<unsigned int> countsHost; + countsNative->copyToHost(countsHost); + + b3AlignedObjectArray<b3SortData> sortDataHost; + m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost); + + + //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); + m_data->m_solverGPU->m_search->executeHost(sortDataHost,nContacts,countsHost,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); + + countsNative->copyFromHost(countsHost); + + + //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, + // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); + + //unsigned int sum; + //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); + b3AlignedObjectArray<unsigned int> offsetsHost; + offsetsHost.resize(offsetsNative->size()); + + + m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum ); + offsetsNative->copyFromHost(offsetsHost); + + //printf("sum = %d\n",sum); + } else + { + // 4. find entries + B3_PROFILE("gpuBoundSearch"); + m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); + m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); + } + + + + + if (nContacts) + { // 5. sort constraints by cellIdx + if (gReorderContactsOnCpu) + { + B3_PROFILE("cpu m_reorderContactKernel"); + b3AlignedObjectArray<b3SortData> sortDataHost; + m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost); + b3AlignedObjectArray<b3Contact4> inContacts; + b3AlignedObjectArray<b3Contact4> outContacts; + m_data->m_pBufContactOutGPU->copyToHost(inContacts); + outContacts.resize(inContacts.size()); + for (int i=0;i<nContacts;i++) + { + int srcIdx = sortDataHost[i].y; + outContacts[i] = inContacts[srcIdx]; + } + m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts); + + /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" + "{\n" + " int nContacts = cb.x;\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int srcIdx = sortData[gIdx].y;\n" + " out[gIdx] = in[srcIdx];\n" + " }\n" + "}\n" + */ + } else + { + B3_PROFILE("gpu m_reorderContactKernel"); + + b3Int4 cdata; + cdata.x = nContacts; + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) + , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + + b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + } + } + + + + + } + + } + + //clFinish(m_data->m_queue); + + // { + // b3AlignedObjectArray<unsigned int> histogram; + // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram); + // printf(",,,\n"); + // } + + + if (nContacts) + { + + if (gUseCpuCopyConstraints) + { + for (int i=0;i<nContacts;i++) + { + m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2); + // m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); + // m_data->m_pBufContactOutGPU->getBufferCL() + } + + } else + { + B3_PROFILE("gpu m_copyConstraintKernel"); + b3Int4 cdata; cdata.x = nContacts; + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL() ), + b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ) + }; + + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel,"m_copyConstraintKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + //we use the clFinish for proper benchmark/profile + clFinish(m_data->m_queue); + } + } + + +// bool compareGPU = false; + if (nContacts) + { + if (!gCpuBatchContacts) + { + B3_PROFILE("gpu batchContacts"); + maxNumBatches = 250;//250; + m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); + clFinish(m_data->m_queue); + } else + { + B3_PROFILE("cpu batchContacts"); + static b3AlignedObjectArray<b3Contact4> cpuContacts; + b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; + { + B3_PROFILE("copyToHost"); + contactsIn->copyToHost(cpuContacts); + } + b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; + b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; + + b3AlignedObjectArray<unsigned int> nNativeHost; + b3AlignedObjectArray<unsigned int> offsetsNativeHost; + + { + B3_PROFILE("countsNative/offsetsNative copyToHost"); + countsNative->copyToHost(nNativeHost); + offsetsNative->copyToHost(offsetsNativeHost); + } + + + int numNonzeroGrid=0; + + if (gUseLargeBatches) + { + m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); + int totalNumConstraints = cpuContacts.size(); + //int simdWidth =numBodies+1;//-1;//64;//-1;//32; + int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]); // on GPU + maxNumBatches = b3Max(numBatches,maxNumBatches); + static int globalMaxBatch = 0; + if (maxNumBatches>globalMaxBatch ) + { + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n",maxNumBatches); + } + + } else + { + m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS*B3_MAX_NUM_BATCHES); + B3_PROFILE("cpu batch grid"); + for(int i=0; i<B3_SOLVER_N_CELLS; i++) + { + int n = (nNativeHost)[i]; + int offset = (offsetsNativeHost)[i]; + if( n ) + { + numNonzeroGrid++; + int simdWidth =numBodies+1;//-1;//64;//-1;//32; + int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[i*B3_MAX_NUM_BATCHES]); // on GPU + maxNumBatches = b3Max(numBatches,maxNumBatches); + static int globalMaxBatch = 0; + if (maxNumBatches>globalMaxBatch ) + { + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n",maxNumBatches); + } + //we use the clFinish for proper benchmark/profile + + } + } + //clFinish(m_data->m_queue); + } + { + B3_PROFILE("m_contactBuffer->copyFromHost"); + m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); + } + + } + + } + + + + + + } + + + } + + + //printf("maxNumBatches = %d\n", maxNumBatches); + + if (gUseLargeBatches) + { + if (nContacts) + { + B3_PROFILE("cpu batchContacts"); + static b3AlignedObjectArray<b3Contact4> cpuContacts; +// b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; + { + B3_PROFILE("copyToHost"); + m_data->m_pBufContactOutGPU->copyToHost(cpuContacts); + } +// b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; +// b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; + + + +// int numNonzeroGrid=0; + + { + m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); + int totalNumConstraints = cpuContacts.size(); + // int simdWidth =numBodies+1;//-1;//64;//-1;//32; + int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]); // on GPU + maxNumBatches = b3Max(numBatches,maxNumBatches); + static int globalMaxBatch = 0; + if (maxNumBatches>globalMaxBatch ) + { + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n",maxNumBatches); + } + + } + { + B3_PROFILE("m_contactBuffer->copyFromHost"); + m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); + } + + } + + } + + if (nContacts) + { + B3_PROFILE("gpu convertToConstraints"); + m_data->m_solverGPU->convertToConstraints( bodyBuf, + shapeBuf, m_data->m_solverGPU->m_contactBuffer2, + contactConstraintOut, + additionalData, nContacts, + (b3SolverBase::ConstraintCfg&) csCfg ); + clFinish(m_data->m_queue); + } + + + if (1) + { + int numIter = 4; + + m_data->m_solverGPU->m_nIterations = numIter;//10 + if (!gCpuSolveConstraint) + { + B3_PROFILE("GPU solveContactConstraint"); + + /*m_data->m_solverGPU->solveContactConstraint( + m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU,0, + nContactOut , + maxNumBatches); + */ + + //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes); + + if (gUseLargeBatches) + { + solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU,0, + nContactOut , + maxNumBatches,numIter,&m_data->m_batchSizes); + } else + { + solveContactConstraint( + m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU,0, + nContactOut , + maxNumBatches,numIter,&m_data->m_batchSizes);//m_data->m_batchSizesGpu); + } + } + else + { + B3_PROFILE("Host solveContactConstraint"); + + m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches,&m_data->m_batchSizes); + } + + + } + + +#if 0 + if (0) + { + B3_PROFILE("read body velocities back to CPU"); + //read body updated linear/angular velocities back to CPU + m_data->m_bodyBufferGPU->read( + m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies); + adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL ); + } +#endif + + } + +} + + +void b3GpuPgsContactSolver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ) +{ +} + + + + + + + + + + + +b3AlignedObjectArray<unsigned int> idxBuffer; +b3AlignedObjectArray<b3SortData> sortData; +b3AlignedObjectArray<b3Contact4> old; + + +inline int b3GpuPgsContactSolver::sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies) +{ + + B3_PROFILE("sortConstraintByBatch"); + int numIter = 0; + + sortData.resize(n); + idxBuffer.resize(n); + old.resize(n); + + unsigned int* idxSrc = &idxBuffer[0]; + unsigned int* idxDst = &idxBuffer[0]; + int nIdxSrc, nIdxDst; + + const int N_FLG = 256; + const int FLG_MASK = N_FLG-1; + unsigned int flg[N_FLG/32]; +#if defined(_DEBUG) + for(int i=0; i<n; i++) + cs[i].getBatchIdx() = -1; +#endif + for(int i=0; i<n; i++) + idxSrc[i] = i; + nIdxSrc = n; + + int batchIdx = 0; + + { + B3_PROFILE("cpu batch innerloop"); + while( nIdxSrc ) + { + numIter++; + nIdxDst = 0; + int nCurrentBatch = 0; + + // clear flag + for(int i=0; i<N_FLG/32; i++) flg[i] = 0; + + for(int i=0; i<nIdxSrc; i++) + { + int idx = idxSrc[i]; + + + b3Assert( idx < n ); + // check if it can go + int bodyAS = cs[idx].m_bodyAPtrAndSignBit; + int bodyBS = cs[idx].m_bodyBPtrAndSignBit; + + + + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + + int aIdx = bodyA & FLG_MASK; + int bIdx = bodyB & FLG_MASK; + + unsigned int aUnavailable = flg[ aIdx/32 ] & (1<<(aIdx&31)); + unsigned int bUnavailable = flg[ bIdx/32 ] & (1<<(bIdx&31)); + + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + + //use inv_mass! + aUnavailable = !aIsStatic? aUnavailable:0;// + bUnavailable = !bIsStatic? bUnavailable:0; + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + flg[ aIdx/32 ] |= (1<<(aIdx&31)); + if (!bIsStatic) + flg[ bIdx/32 ] |= (1<<(bIdx&31)); + + cs[idx].getBatchIdx() = batchIdx; + sortData[idx].m_key = batchIdx; + sortData[idx].m_value = idx; + + { + nCurrentBatch++; + if( nCurrentBatch == simdWidth ) + { + nCurrentBatch = 0; + for(int i=0; i<N_FLG/32; i++) flg[i] = 0; + } + } + } + else + { + idxDst[nIdxDst++] = idx; + } + } + b3Swap( idxSrc, idxDst ); + b3Swap( nIdxSrc, nIdxDst ); + batchIdx ++; + } + } + { + B3_PROFILE("quickSort"); + sortData.quickSort(sortfnc); + } + + + { + B3_PROFILE("reorder"); + // reorder + + memcpy( &old[0], cs, sizeof(b3Contact4)*n); + for(int i=0; i<n; i++) + { + int idx = sortData[i].m_value; + cs[i] = old[idx]; + } + } + + +#if defined(_DEBUG) + // debugPrintf( "nBatches: %d\n", batchIdx ); + for(int i=0; i<n; i++) + { + b3Assert( cs[i].getBatchIdx() != -1 ); + } +#endif + return batchIdx; +} + + +b3AlignedObjectArray<int> bodyUsed2; + +inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) +{ + + B3_PROFILE("sortConstraintByBatch2"); + + + + bodyUsed2.resize(2*simdWidth); + + for (int q=0;q<2*simdWidth;q++) + bodyUsed2[q]=0; + + int curBodyUsed = 0; + + int numIter = 0; + + m_data->m_sortData.resize(numConstraints); + m_data->m_idxBuffer.resize(numConstraints); + m_data->m_old.resize(numConstraints); + + unsigned int* idxSrc = &m_data->m_idxBuffer[0]; + +#if defined(_DEBUG) + for(int i=0; i<numConstraints; i++) + cs[i].getBatchIdx() = -1; +#endif + for(int i=0; i<numConstraints; i++) + idxSrc[i] = i; + + int numValidConstraints = 0; +// int unprocessedConstraintIndex = 0; + + int batchIdx = 0; + + + { + B3_PROFILE("cpu batch innerloop"); + + while( numValidConstraints < numConstraints) + { + numIter++; + int nCurrentBatch = 0; + // clear flag + for(int i=0; i<curBodyUsed; i++) + bodyUsed2[i] = 0; + curBodyUsed = 0; + + for(int i=numValidConstraints; i<numConstraints; i++) + { + int idx = idxSrc[i]; + b3Assert( idx < numConstraints ); + // check if it can go + int bodyAS = cs[idx].m_bodyAPtrAndSignBit; + int bodyBS = cs[idx].m_bodyBPtrAndSignBit; + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + int aUnavailable = 0; + int bUnavailable = 0; + if (!aIsStatic) + { + for (int j=0;j<curBodyUsed;j++) + { + if (bodyA == bodyUsed2[j]) + { + aUnavailable=1; + break; + } + } + } + if (!aUnavailable) + if (!bIsStatic) + { + for (int j=0;j<curBodyUsed;j++) + { + if (bodyB == bodyUsed2[j]) + { + bUnavailable=1; + break; + } + } + } + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + { + bodyUsed2[curBodyUsed++] = bodyA; + } + if (!bIsStatic) + { + bodyUsed2[curBodyUsed++] = bodyB; + } + + cs[idx].getBatchIdx() = batchIdx; + m_data->m_sortData[idx].m_key = batchIdx; + m_data->m_sortData[idx].m_value = idx; + + if (i!=numValidConstraints) + { + b3Swap(idxSrc[i], idxSrc[numValidConstraints]); + } + + numValidConstraints++; + { + nCurrentBatch++; + if( nCurrentBatch == simdWidth ) + { + nCurrentBatch = 0; + for(int i=0; i<curBodyUsed; i++) + bodyUsed2[i] = 0; + + + curBodyUsed = 0; + } + } + } + } + + batchIdx ++; + } + } + { + B3_PROFILE("quickSort"); + //m_data->m_sortData.quickSort(sortfnc); + } + + { + B3_PROFILE("reorder"); + // reorder + + memcpy( &m_data->m_old[0], cs, sizeof(b3Contact4)*numConstraints); + + for(int i=0; i<numConstraints; i++) + { + b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]); + int idx = m_data->m_sortData[idxSrc[i]].m_value; + cs[i] = m_data->m_old[idx]; + } + } + +#if defined(_DEBUG) + // debugPrintf( "nBatches: %d\n", batchIdx ); + for(int i=0; i<numConstraints; i++) + { + b3Assert( cs[i].getBatchIdx() != -1 ); + } +#endif + + + return batchIdx; +} + + +b3AlignedObjectArray<int> bodyUsed; +b3AlignedObjectArray<int> curUsed; + + +inline int b3GpuPgsContactSolver::sortConstraintByBatch3( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies, int* batchSizes) +{ + + B3_PROFILE("sortConstraintByBatch3"); + + static int maxSwaps = 0; + int numSwaps = 0; + + curUsed.resize(2*simdWidth); + + static int maxNumConstraints = 0; + if (maxNumConstraints<numConstraints) + { + maxNumConstraints = numConstraints; + //printf("maxNumConstraints = %d\n",maxNumConstraints ); + } + + int numUsedArray = numBodies/32+1; + bodyUsed.resize(numUsedArray); + + for (int q=0;q<numUsedArray;q++) + bodyUsed[q]=0; + + + int curBodyUsed = 0; + + int numIter = 0; + + m_data->m_sortData.resize(0); + m_data->m_idxBuffer.resize(0); + m_data->m_old.resize(0); + + +#if defined(_DEBUG) + for(int i=0; i<numConstraints; i++) + cs[i].getBatchIdx() = -1; +#endif + + int numValidConstraints = 0; +// int unprocessedConstraintIndex = 0; + + int batchIdx = 0; + + + { + B3_PROFILE("cpu batch innerloop"); + + while( numValidConstraints < numConstraints) + { + numIter++; + int nCurrentBatch = 0; + batchSizes[batchIdx] = 0; + + // clear flag + for(int i=0; i<curBodyUsed; i++) + bodyUsed[curUsed[i]/32] = 0; + + curBodyUsed = 0; + + for(int i=numValidConstraints; i<numConstraints; i++) + { + int idx = i; + b3Assert( idx < numConstraints ); + // check if it can go + int bodyAS = cs[idx].m_bodyAPtrAndSignBit; + int bodyBS = cs[idx].m_bodyBPtrAndSignBit; + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + int aUnavailable = 0; + int bUnavailable = 0; + if (!aIsStatic) + { + aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31)); + } + if (!aUnavailable) + if (!bIsStatic) + { + bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31)); + } + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + { + bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31)); + curUsed[curBodyUsed++]=bodyA; + } + if (!bIsStatic) + { + bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31)); + curUsed[curBodyUsed++]=bodyB; + } + + cs[idx].getBatchIdx() = batchIdx; + + if (i!=numValidConstraints) + { + b3Swap(cs[i],cs[numValidConstraints]); + numSwaps++; + } + + numValidConstraints++; + { + nCurrentBatch++; + if( nCurrentBatch == simdWidth ) + { + batchSizes[batchIdx] += simdWidth; + nCurrentBatch = 0; + for(int i=0; i<curBodyUsed; i++) + bodyUsed[curUsed[i]/32] = 0; + curBodyUsed = 0; + } + } + } + } + + if (batchIdx>=B3_MAX_NUM_BATCHES) + { + b3Error("batchIdx>=B3_MAX_NUM_BATCHES"); + b3Assert(0); + break; + } + + batchSizes[batchIdx] += nCurrentBatch; + + batchIdx ++; + + } + } + +#if defined(_DEBUG) + // debugPrintf( "nBatches: %d\n", batchIdx ); + for(int i=0; i<numConstraints; i++) + { + b3Assert( cs[i].getBatchIdx() != -1 ); + } +#endif + + batchSizes[batchIdx] =0; + + if (maxSwaps<numSwaps) + { + maxSwaps = numSwaps; + //printf("maxSwaps = %d\n", maxSwaps); + } + + return batchIdx; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h new file mode 100644 index 0000000000..98e2a5b8c4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h @@ -0,0 +1,43 @@ + +#ifndef B3_GPU_BATCHING_PGS_SOLVER_H +#define B3_GPU_BATCHING_PGS_SOLVER_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "b3GpuConstraint4.h" + +class b3GpuPgsContactSolver +{ +protected: + + int m_debugOutput; + + struct b3GpuBatchingPgsSolverInternalData* m_data; + + void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ); + + inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies); + inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies); + inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies, int* batchSizes); + + + + void solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes); + + void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes); + +public: + + b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity); + virtual ~b3GpuPgsContactSolver(); + + void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); + +}; + +#endif //B3_GPU_BATCHING_PGS_SOLVER_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp new file mode 100644 index 0000000000..783e443060 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp @@ -0,0 +1,708 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "b3GpuRigidBodyPipeline.h" +#include "b3GpuRigidBodyPipelineInternalData.h" +#include "kernels/integrateKernel.h" +#include "kernels/updateAabbsKernel.h" + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3GpuNarrowPhase.h" +#include "Bullet3Geometry/b3AabbUtil.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h" +#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h" + +//#define TEST_OTHER_GPU_SOLVER + +#define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl" +#define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl" + +bool useBullet2CpuSolver = true; + +//choice of contact solver +bool gUseJacobi = false; +bool gUseDbvt = false; +bool gDumpContactStats = false; +bool gCalcWorldSpaceAabbOnCpu = false; +bool gUseCalculateOverlappingPairsHost = false; +bool gIntegrateOnCpu = false; +bool gClearPairsOnGpu = true; + +#define TEST_OTHER_GPU_SOLVER 1 +#ifdef TEST_OTHER_GPU_SOLVER +#include "b3GpuJacobiContactSolver.h" +#endif //TEST_OTHER_GPU_SOLVER + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" +#include "Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h" + +#include "b3GpuPgsContactSolver.h" +#include "b3Solver.h" + +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" +#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h" + + +#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h" +#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h" + +b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap , struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config) +{ + m_data = new b3GpuRigidBodyPipelineInternalData; + m_data->m_constraintUid=0; + m_data->m_config = config; + m_data->m_context = ctx; + m_data->m_device = device; + m_data->m_queue = q; + + m_data->m_solver = new b3PgsJacobiSolver(true);//new b3PgsJacobiSolver(true); + m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx,device,q,true);//new b3PgsJacobiSolver(true); + + m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,config.m_maxConvexBodies); + m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,config.m_maxBroadphasePairs); + + m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx,q); +#ifdef TEST_OTHER_GPU_SOLVER + m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx,device,q,config.m_maxBroadphasePairs); +#endif // TEST_OTHER_GPU_SOLVER + + m_data->m_solver2 = new b3GpuPgsContactSolver(ctx,device,q,config.m_maxBroadphasePairs); + + m_data->m_raycaster = new b3GpuRaycast(ctx,device,q); + + + m_data->m_broadphaseDbvt = broadphaseDbvt; + m_data->m_broadphaseSap = broadphaseSap; + m_data->m_narrowphase = narrowphase; + m_data->m_gravity.setValue(0.f,-9.8f,0.f); + + cl_int errNum=0; + + { + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH); + b3Assert(errNum==CL_SUCCESS); + m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + clReleaseProgram(prog); + } + { + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH); + b3Assert(errNum==CL_SUCCESS); + m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + + + m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + + clReleaseProgram(prog); + } + + +} + +b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline() +{ + if (m_data->m_integrateTransformsKernel) + clReleaseKernel(m_data->m_integrateTransformsKernel); + + if (m_data->m_updateAabbsKernel) + clReleaseKernel(m_data->m_updateAabbsKernel); + + if (m_data->m_clearOverlappingPairsKernel) + clReleaseKernel(m_data->m_clearOverlappingPairsKernel); + delete m_data->m_raycaster; + delete m_data->m_solver; + delete m_data->m_allAabbsGPU; + delete m_data->m_gpuConstraints; + delete m_data->m_overlappingPairsGPU; + +#ifdef TEST_OTHER_GPU_SOLVER + delete m_data->m_solver3; +#endif //TEST_OTHER_GPU_SOLVER + + delete m_data->m_solver2; + + + delete m_data; +} + +void b3GpuRigidBodyPipeline::reset() +{ + m_data->m_gpuConstraints->resize(0); + m_data->m_cpuConstraints.resize(0); + m_data->m_allAabbsGPU->resize(0); + m_data->m_allAabbsCPU.resize(0); +} + +void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint) +{ + m_data->m_joints.push_back(constraint); +} + +void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint) +{ + m_data->m_joints.remove(constraint); +} + + + +void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid) +{ + m_data->m_gpuSolver->recomputeBatches(); + //slow linear search + m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints); + //remove + for (int i=0;i<m_data->m_cpuConstraints.size();i++) + { + if (m_data->m_cpuConstraints[i].m_uid == uid) + { + //m_data->m_cpuConstraints.remove(m_data->m_cpuConstraints[i]); + m_data->m_cpuConstraints.swap(i,m_data->m_cpuConstraints.size()-1); + m_data->m_cpuConstraints.pop_back(); + + break; + } + } + + if (m_data->m_cpuConstraints.size()) + { + m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints); + } else + { + m_data->m_gpuConstraints->resize(0); + } + +} +int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold) +{ + m_data->m_gpuSolver->recomputeBatches(); + b3GpuGenericConstraint c; + c.m_uid = m_data->m_constraintUid; + m_data->m_constraintUid++; + c.m_flags = B3_CONSTRAINT_FLAG_ENABLED; + c.m_rbA = bodyA; + c.m_rbB = bodyB; + c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]); + c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]); + c.m_breakingImpulseThreshold = breakingThreshold; + c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE; + m_data->m_cpuConstraints.push_back(c); + return c.m_uid; +} +int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB,float breakingThreshold) +{ + m_data->m_gpuSolver->recomputeBatches(); + b3GpuGenericConstraint c; + c.m_uid = m_data->m_constraintUid; + m_data->m_constraintUid++; + c.m_flags = B3_CONSTRAINT_FLAG_ENABLED; + c.m_rbA = bodyA; + c.m_rbB = bodyB; + c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]); + c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]); + c.m_relTargetAB.setValue(relTargetAB[0],relTargetAB[1],relTargetAB[2],relTargetAB[3]); + c.m_breakingImpulseThreshold = breakingThreshold; + c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE; + + m_data->m_cpuConstraints.push_back(c); + return c.m_uid; +} + + +void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) +{ + + //update worldspace AABBs from local AABB/worldtransform + { + B3_PROFILE("setupGpuAabbs"); + setupGpuAabbsFull(); + } + + int numPairs =0; + + //compute overlapping pairs + { + + if (gUseDbvt) + { + { + B3_PROFILE("setAabb"); + m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU); + for (int i=0;i<m_data->m_allAabbsCPU.size();i++) + { + b3Vector3 aabbMin=b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]); + b3Vector3 aabbMax=b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]); + m_data->m_broadphaseDbvt->setAabb(i,aabbMin,aabbMax,0); + } + } + + { + B3_PROFILE("calculateOverlappingPairs"); + m_data->m_broadphaseDbvt->calculateOverlappingPairs(); + } + numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs(); + + } else + { + if (gUseCalculateOverlappingPairsHost) + { + m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs); + } else + { + m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs); + } + numPairs = m_data->m_broadphaseSap->getNumOverlap(); + } + } + + //compute contact points +// printf("numPairs=%d\n",numPairs); + + int numContacts = 0; + + + int numBodies = m_data->m_narrowphase->getNumRigidBodies(); + + if (numPairs) + { + cl_mem pairs =0; + cl_mem aabbsWS =0; + if (gUseDbvt) + { + B3_PROFILE("m_overlappingPairsGPU->copyFromHost"); + m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray()); + pairs = m_data->m_overlappingPairsGPU->getBufferCL(); + aabbsWS = m_data->m_allAabbsGPU->getBufferCL(); + } else + { + pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer(); + aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS(); + } + + m_data->m_overlappingPairsGPU->resize(numPairs); + + //mark the contacts for each pair as 'unused' + if (numPairs) + { + b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue); + gpuPairs.setFromOpenCLBuffer(pairs,numPairs); + + if (gClearPairsOnGpu) + { + + + //b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging + //gpuPairs.copyToHost(hostPairs); + + b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel"); + launcher.setBuffer(pairs); + launcher.setConst(numPairs); + launcher.launch1D(numPairs); + + + //gpuPairs.copyToHost(hostPairs); + + + } else + { + b3AlignedObjectArray<b3BroadphasePair> hostPairs; + gpuPairs.copyToHost(hostPairs); + + for (int i=0;i<hostPairs.size();i++) + { + hostPairs[i].z = 0xffffffff; + } + + gpuPairs.copyFromHost(hostPairs); + } + } + + m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies); + numContacts = m_data->m_narrowphase->getNumContactsGpu(); + + if (gUseDbvt) + { + ///store the cached information (contact locations in the 'z' component) + B3_PROFILE("m_overlappingPairsGPU->copyToHost"); + m_data->m_overlappingPairsGPU->copyToHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray()); + } + if (gDumpContactStats && numContacts) + { + m_data->m_narrowphase->getContactsGpu(); + + printf("numContacts = %d\n", numContacts); + + int totalPoints = 0; + const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU(); + + for (int i=0;i<numContacts;i++) + { + totalPoints += contacts->getNPoints(); + } + printf("totalPoints=%d\n",totalPoints); + + } + } + + + //convert contact points to contact constraints + + //solve constraints + + b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context,m_data->m_queue,0,true); + gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumRigidBodies()); + b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context,m_data->m_queue,0,true); + gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumRigidBodies()); + b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true); + gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu()); + + int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size(); + if (useBullet2CpuSolver && numJoints) + { + + // b3AlignedObjectArray<b3Contact4> hostContacts; + //gpuContacts.copyToHost(hostContacts); + { + bool useGpu = m_data->m_joints.size()==0; + +// b3Contact4* contacts = numContacts? &hostContacts[0]: 0; + //m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints); + if (useGpu) + { + m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(),&gpuBodies,&gpuInertias,numJoints, m_data->m_gpuConstraints); + } else + { + b3AlignedObjectArray<b3RigidBodyData> hostBodies; + gpuBodies.copyToHost(hostBodies); + b3AlignedObjectArray<b3InertiaData> hostInertias; + gpuInertias.copyToHost(hostInertias); + + b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0; + m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints); + gpuBodies.copyFromHost(hostBodies); + } + } + } + + if (numContacts) + { + +#ifdef TEST_OTHER_GPU_SOLVER + + if (gUseJacobi) + { + bool useGpu = true; + if (useGpu) + { + + bool forceHost = false; + if (forceHost) + { + b3AlignedObjectArray<b3RigidBodyData> hostBodies; + b3AlignedObjectArray<b3InertiaData> hostInertias; + b3AlignedObjectArray<b3Contact4> hostContacts; + + { + B3_PROFILE("copyToHost"); + gpuBodies.copyToHost(hostBodies); + gpuInertias.copyToHost(hostInertias); + gpuContacts.copyToHost(hostContacts); + } + + { + b3JacobiSolverInfo solverInfo; + m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),solverInfo); + + + } + { + B3_PROFILE("copyFromHost"); + gpuBodies.copyFromHost(hostBodies); + } + } else + + + { + int static0Index = m_data->m_narrowphase->getStatic0Index(); + b3JacobiSolverInfo solverInfo; + //m_data->m_solver3->solveContacts( >solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo); + //m_data->m_solver3->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]); + m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index); + } + } else + { + b3AlignedObjectArray<b3RigidBodyData> hostBodies; + gpuBodies.copyToHost(hostBodies); + b3AlignedObjectArray<b3InertiaData> hostInertias; + gpuInertias.copyToHost(hostInertias); + b3AlignedObjectArray<b3Contact4> hostContacts; + gpuContacts.copyToHost(hostContacts); + { + //m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]); + } + gpuBodies.copyFromHost(hostBodies); + } + + } else +#endif //TEST_OTHER_GPU_SOLVER + { + + int static0Index = m_data->m_narrowphase->getStatic0Index(); + m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index); + + //m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL()); + + + /*m_data->m_solver3->solveContactConstraintHost( + (b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies, + (b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias, + (b3OpenCLArray<Constraint4>*) &gpuContacts, + 0,numContacts,256); + */ + } + } + + integrate(deltaTime); + +} + + +void b3GpuRigidBodyPipeline::integrate(float timeStep) +{ + //integrate + int numBodies = m_data->m_narrowphase->getNumRigidBodies(); + float angularDamp = 0.99f; + + if (gIntegrateOnCpu) + { + if(numBodies) + { + b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData(); + npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU); + + b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0); + + for (int nodeID=0;nodeID<numBodies;nodeID++) + { + integrateSingleTransform( bodies,nodeID, timeStep, angularDamp, m_data->m_gravity); + } + npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU); + } + } else + { + b3LauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel,"m_integrateTransformsKernel"); + launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu()); + + launcher.setConst(numBodies); + launcher.setConst(timeStep); + launcher.setConst(angularDamp); + launcher.setConst(m_data->m_gravity); + launcher.launch1D(numBodies); + } +} + + + + +void b3GpuRigidBodyPipeline::setupGpuAabbsFull() +{ + cl_int ciErrNum=0; + + int numBodies = m_data->m_narrowphase->getNumRigidBodies(); + if (!numBodies) + return; + + if (gCalcWorldSpaceAabbOnCpu) + { + + if (numBodies) + { + if (gUseDbvt) + { + m_data->m_allAabbsCPU.resize(numBodies); + m_data->m_narrowphase->readbackAllBodiesToCpu(); + for (int i=0;i<numBodies;i++) + { + b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_allAabbsCPU[0]); + } + m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); + } else + { + m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies); + m_data->m_narrowphase->readbackAllBodiesToCpu(); + for (int i=0;i<numBodies;i++) + { + b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_broadphaseSap->getAllAabbsCPU()[0]); + } + m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU()); + //m_data->m_broadphaseSap->writeAabbsToGpu(); + } + } + } else + { + //__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB) + b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel,"m_updateAabbsKernel"); + launcher.setConst(numBodies); + cl_mem bodies = m_data->m_narrowphase->getBodiesGpu(); + launcher.setBuffer(bodies); + cl_mem collidables = m_data->m_narrowphase->getCollidablesGpu(); + launcher.setBuffer(collidables); + cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu(); + launcher.setBuffer(localAabbs); + + cl_mem worldAabbs =0; + if (gUseDbvt) + { + worldAabbs = m_data->m_allAabbsGPU->getBufferCL(); + } else + { + worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS(); + } + launcher.setBuffer(worldAabbs); + launcher.launch1D(numBodies); + + oclCHECKERROR(ciErrNum, CL_SUCCESS); + } + + /* + b3AlignedObjectArray<b3SapAabb> aabbs; + m_data->m_broadphaseSap->m_allAabbsGPU.copyToHost(aabbs); + + printf("numAabbs = %d\n", aabbs.size()); + + for (int i=0;i<aabbs.size();i++) + { + printf("aabb[%d].m_min=%f,%f,%f,%d\n",i,aabbs[i].m_minVec[0],aabbs[i].m_minVec[1],aabbs[i].m_minVec[2],aabbs[i].m_minIndices[3]); + printf("aabb[%d].m_max=%f,%f,%f,%d\n",i,aabbs[i].m_maxVec[0],aabbs[i].m_maxVec[1],aabbs[i].m_maxVec[2],aabbs[i].m_signedMaxIndices[3]); + + }; + */ + + + + + +} + + + +cl_mem b3GpuRigidBodyPipeline::getBodyBuffer() +{ + return m_data->m_narrowphase->getBodiesGpu(); +} + +int b3GpuRigidBodyPipeline::getNumBodies() const +{ + return m_data->m_narrowphase->getNumRigidBodies(); +} + +void b3GpuRigidBodyPipeline::setGravity(const float* grav) +{ + m_data->m_gravity.setValue(grav[0],grav[1],grav[2]); +} + +void b3GpuRigidBodyPipeline::copyConstraintsToHost() +{ + m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints); +} + +void b3GpuRigidBodyPipeline::writeAllInstancesToGpu() +{ + m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); + m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints); +} + + +int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu) +{ + + b3Vector3 aabbMin=b3MakeVector3(0,0,0),aabbMax=b3MakeVector3(0,0,0); + + + if (collidableIndex>=0) + { + b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex); + b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]); + b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]); + + b3Scalar margin = 0.01f; + b3Transform t; + t.setIdentity(); + t.setOrigin(b3MakeVector3(position[0],position[1],position[2])); + t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3])); + b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax); + } else + { + b3Error("registerPhysicsInstance using invalid collidableIndex\n"); + return -1; + } + + + bool writeToGpu = false; + int bodyIndex = m_data->m_narrowphase->getNumRigidBodies(); + bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu); + + if (bodyIndex>=0) + { + if (gUseDbvt) + { + m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1); + b3SapAabb aabb; + for (int i=0;i<3;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + aabb.m_minIndices[3] = bodyIndex; + } + m_data->m_allAabbsCPU.push_back(aabb); + if (writeInstanceToGpu) + { + m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); + } + } else + { + if (mass) + { + m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher); + } else + { + m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher); + } + } + } + + /* + if (mass>0.f) + m_numDynamicPhysicsInstances++; + + m_numPhysicsInstances++; + */ + + return bodyIndex; +} + +void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults) +{ + this->m_data->m_raycaster->castRays(rays,hitResults, + getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(), + m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), + m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap); +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h new file mode 100644 index 0000000000..b4eac6841a --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h @@ -0,0 +1,74 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#ifndef B3_GPU_RIGIDBODY_PIPELINE_H +#define B3_GPU_RIGIDBODY_PIPELINE_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" + +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" + +class b3GpuRigidBodyPipeline +{ +protected: + struct b3GpuRigidBodyPipelineInternalData* m_data; + + int allocateCollidable(); + +public: + + + b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q , class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config); + virtual ~b3GpuRigidBodyPipeline(); + + void stepSimulation(float deltaTime); + void integrate(float timeStep); + void setupGpuAabbsFull(); + + int registerConvexPolyhedron(class b3ConvexUtility* convex); + + //int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling); + //int registerSphereShape(float radius); + //int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant); + + //int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling); + //int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes); + + + int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu); + //if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered + void writeAllInstancesToGpu(); + void copyConstraintsToHost(); + void setGravity(const float* grav); + void reset(); + + int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold); + int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold); + void removeConstraintByUid(int uid); + + void addConstraint(class b3TypedConstraint* constraint); + void removeConstraint(b3TypedConstraint* constraint); + + void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults); + + cl_mem getBodyBuffer(); + + int getNumBodies() const; + +}; + +#endif //B3_GPU_RIGIDBODY_PIPELINE_H
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h new file mode 100644 index 0000000000..5ac92f97d6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h @@ -0,0 +1,73 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#ifndef B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H +#define B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" + + +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" + + + +#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h" +#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h" + +struct b3GpuRigidBodyPipelineInternalData +{ + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + cl_kernel m_integrateTransformsKernel; + cl_kernel m_updateAabbsKernel; + cl_kernel m_clearOverlappingPairsKernel; + + class b3PgsJacobiSolver* m_solver; + + class b3GpuPgsConstraintSolver* m_gpuSolver; + + class b3GpuPgsContactSolver* m_solver2; + class b3GpuJacobiContactSolver* m_solver3; + class b3GpuRaycast* m_raycaster; + + class b3GpuBroadphaseInterface* m_broadphaseSap; + + struct b3DynamicBvhBroadphase* m_broadphaseDbvt; + b3OpenCLArray<b3SapAabb>* m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU; + + b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints; + b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; + + b3AlignedObjectArray<b3TypedConstraint*> m_joints; + int m_constraintUid; + class b3GpuNarrowPhase* m_narrowphase; + b3Vector3 m_gravity; + + b3Config m_config; +}; + +#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h new file mode 100644 index 0000000000..f2a61801ac --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h @@ -0,0 +1,228 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +#ifndef B3_GPU_SOLVER_BODY_H +#define B3_GPU_SOLVER_BODY_H + + +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3Common/b3Matrix3x3.h" + +#include "Bullet3Common/b3AlignedAllocator.h" +#include "Bullet3Common/b3TransformUtil.h" + +///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision +#ifdef B3_USE_SSE +#define USE_SIMD 1 +#endif // + + + +///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance. +B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); +// b3Transform m_worldTransformUnused; + b3Vector3 m_deltaLinearVelocity; + b3Vector3 m_deltaAngularVelocity; + b3Vector3 m_angularFactor; + b3Vector3 m_linearFactor; + b3Vector3 m_invMass; + b3Vector3 m_pushVelocity; + b3Vector3 m_turnVelocity; + b3Vector3 m_linearVelocity; + b3Vector3 m_angularVelocity; + + union + { + void* m_originalBody; + int m_originalBodyIndex; + }; + + int padding[3]; + + /* + void setWorldTransform(const b3Transform& worldTransform) + { + m_worldTransform = worldTransform; + } + + const b3Transform& getWorldTransform() const + { + return m_worldTransform; + } + */ + B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const + { + if (m_originalBody) + velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos); + else + velocity.setValue(0,0,0); + } + + B3_FORCE_INLINE void getAngularVelocity(b3Vector3& angVel) const + { + if (m_originalBody) + angVel =m_angularVelocity+m_deltaAngularVelocity; + else + angVel.setValue(0,0,0); + } + + + //Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position + B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) + { + if (m_originalBody) + { + m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor; + m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + } + } + + B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude) + { + if (m_originalBody) + { + m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor; + m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + } + } + + + + const b3Vector3& getDeltaLinearVelocity() const + { + return m_deltaLinearVelocity; + } + + const b3Vector3& getDeltaAngularVelocity() const + { + return m_deltaAngularVelocity; + } + + const b3Vector3& getPushVelocity() const + { + return m_pushVelocity; + } + + const b3Vector3& getTurnVelocity() const + { + return m_turnVelocity; + } + + + //////////////////////////////////////////////// + ///some internal methods, don't use them + + b3Vector3& internalGetDeltaLinearVelocity() + { + return m_deltaLinearVelocity; + } + + b3Vector3& internalGetDeltaAngularVelocity() + { + return m_deltaAngularVelocity; + } + + const b3Vector3& internalGetAngularFactor() const + { + return m_angularFactor; + } + + const b3Vector3& internalGetInvMass() const + { + return m_invMass; + } + + void internalSetInvMass(const b3Vector3& invMass) + { + m_invMass = invMass; + } + + b3Vector3& internalGetPushVelocity() + { + return m_pushVelocity; + } + + b3Vector3& internalGetTurnVelocity() + { + return m_turnVelocity; + } + + B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const + { + velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos); + } + + B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3& angVel) const + { + angVel = m_angularVelocity+m_deltaAngularVelocity; + } + + + //Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position + B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) + { + //if (m_originalBody) + { + m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor; + m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + } + } + + + + + void writebackVelocity() + { + //if (m_originalBody>=0) + { + m_linearVelocity +=m_deltaLinearVelocity; + m_angularVelocity += m_deltaAngularVelocity; + + //m_originalBody->setCompanionId(-1); + } + } + + + void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp) + { + (void) timeStep; + if (m_originalBody) + { + m_linearVelocity += m_deltaLinearVelocity; + m_angularVelocity += m_deltaAngularVelocity; + + //correct the position/orientation based on push/turn recovery + b3Transform newTransform; + if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0) + { + // b3Quaternion orn = m_worldTransform.getRotation(); +// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform); +// m_worldTransform = newTransform; + } + //m_worldTransform.setRotation(orn); + //m_originalBody->setCompanionId(-1); + } + } + + + +}; + +#endif //B3_SOLVER_BODY_H + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h new file mode 100644 index 0000000000..60d235baab --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h @@ -0,0 +1,82 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2013 Erwin Coumans http://github.com/erwincoumans/bullet3 + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#ifndef B3_GPU_SOLVER_CONSTRAINT_H +#define B3_GPU_SOLVER_CONSTRAINT_H + + +#include "Bullet3Common/b3Vector3.h" +#include "Bullet3Common/b3Matrix3x3.h" +//#include "b3JacobianEntry.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + +//#define NO_FRICTION_TANGENTIALS 1 + + + +///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints. +B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverConstraint +{ + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3Vector3 m_relpos1CrossNormal; + b3Vector3 m_contactNormal; + + b3Vector3 m_relpos2CrossNormal; + //b3Vector3 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal + + b3Vector3 m_angularComponentA; + b3Vector3 m_angularComponentB; + + mutable b3Scalar m_appliedPushImpulse; + mutable b3Scalar m_appliedImpulse; + int m_padding1; + int m_padding2; + b3Scalar m_friction; + b3Scalar m_jacDiagABInv; + b3Scalar m_rhs; + b3Scalar m_cfm; + + b3Scalar m_lowerLimit; + b3Scalar m_upperLimit; + b3Scalar m_rhsPenetration; + union + { + void* m_originalContactPoint; + int m_originalConstraintIndex; + b3Scalar m_unusedPadding4; + }; + + int m_overrideNumSolverIterations; + int m_frictionIndex; + int m_solverBodyIdA; + int m_solverBodyIdB; + + + enum b3SolverConstraintType + { + B3_SOLVER_CONTACT_1D = 0, + B3_SOLVER_FRICTION_1D + }; +}; + +typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray; + + +#endif //B3_GPU_SOLVER_CONSTRAINT_H + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp new file mode 100644 index 0000000000..20bf6d47c5 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.cpp @@ -0,0 +1,1225 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#include "b3Solver.h" + +///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments +bool useNewBatchingKernel = true; +bool gConvertConstraintOnCpu = false; + +#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl" +#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl" +#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl" +#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl" +#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl" +#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl" + +#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h" + +#include "kernels/solverSetup.h" +#include "kernels/solverSetup2.h" + +#include "kernels/solveContact.h" +#include "kernels/solveFriction.h" + +#include "kernels/batchingKernels.h" +#include "kernels/batchingKernelsNew.h" + + +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3Common/b3Vector3.h" + +struct SolverDebugInfo +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + int m_valInt4; + int m_valInt5; + int m_valInt6; + int m_valInt7; + + int m_valInt8; + int m_valInt9; + int m_valInt10; + int m_valInt11; + + int m_valInt12; + int m_valInt13; + int m_valInt14; + int m_valInt15; + + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +}; + + + + +class SolverDeviceInl +{ +public: + struct ParallelSolveData + { + b3OpenCLArray<unsigned int>* m_numConstraints; + b3OpenCLArray<unsigned int>* m_offsets; + }; +}; + + + +b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity) + : + m_context(ctx), + m_device(device), + m_queue(queue), + m_batchSizes(ctx,queue), + m_nIterations(4) +{ + m_sort32 = new b3RadixSort32CL(ctx,device,queue); + m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS); + m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS); + + const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 ); + + m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize); + m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue); + + m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS ); + m_numConstraints->resize(B3_SOLVER_N_CELLS); + + m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS); + m_offsets->resize(B3_SOLVER_N_CELLS); + const char* additionalMacros = ""; +// const char* srcFileNameForCaching=""; + + + + cl_int pErrNum; + const char* batchKernelSource = batchingKernelsCL; + const char* batchKernelNewSource = batchingKernelsNewCL; + + const char* solverSetupSource = solverSetupCL; + const char* solverSetup2Source = solverSetup2CL; + const char* solveContactSource = solveContactCL; + const char* solveFrictionSource = solveFrictionCL; + + + + { + + cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + b3Assert(solveContactProg); + + cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); + b3Assert(solveFrictionProg); + + cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH); + b3Assert(solverSetup2Prog); + + + cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); + b3Assert(solverSetupProg); + + + m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + b3Assert(m_solveFrictionKernel); + + m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + b3Assert(m_solveContactKernel); + + m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + b3Assert(m_contactToConstraintKernel); + + m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_setSortDataKernel); + + m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_reorderContactKernel); + + + m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_copyConstraintKernel); + + } + + { + cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH); + //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true); + b3Assert(batchingProg); + + m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + b3Assert(m_batchingKernel); + } + { + cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH); + b3Assert(batchingNewProg); + + m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); + //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros ); + b3Assert(m_batchingKernelNew); + } +} + +b3Solver::~b3Solver() +{ + delete m_offsets; + delete m_numConstraints; + delete m_sortDataBuffer; + delete m_contactBuffer2; + + delete m_sort32; + delete m_scan; + delete m_search; + + + clReleaseKernel(m_batchingKernel); + clReleaseKernel(m_batchingKernelNew); + + clReleaseKernel( m_solveContactKernel); + clReleaseKernel( m_solveFrictionKernel); + + clReleaseKernel( m_contactToConstraintKernel); + clReleaseKernel( m_setSortDataKernel); + clReleaseKernel( m_reorderContactKernel); + clReleaseKernel( m_copyConstraintKernel); + +} + + + + +template<bool JACOBI> +static +__inline +void solveContact(b3GpuConstraint4& cs, + const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4]) +{ + + b3Vector3 dLinVelA; dLinVelA.setZero(); + b3Vector3 dAngVelA; dAngVelA.setZero(); + b3Vector3 dLinVelB; dLinVelB.setZero(); + b3Vector3 dAngVelB; dAngVelB.setZero(); + + for(int ic=0; ic<4; ic++) + { + // dont necessary because this makes change to 0 + if( cs.m_jacCoeffInv[ic] == 0.f ) continue; + + { + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; + b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; + setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 ); + + float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic]; + rambdaDt *= cs.m_jacCoeffInv[ic]; + + { + float prevSum = cs.m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max( updated, minRambdaDt[ic] ); + updated = b3Min( updated, maxRambdaDt[ic] ); + rambdaDt = updated - prevSum; + cs.m_appliedRambdaDt[ic] = updated; + } + + b3Vector3 linImp0 = invMassA*linear*rambdaDt; + b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; + b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; + b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; +#ifdef _WIN32 + b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp1.getX())); +#endif + if( JACOBI ) + { + dLinVelA += linImp0; + dAngVelA += angImp0; + dLinVelB += linImp1; + dAngVelB += angImp1; + } + else + { + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + } + } + + if( JACOBI ) + { + linVelA += dLinVelA; + angVelA += dAngVelA; + linVelB += dLinVelB; + angVelB += dAngVelB; + } + +} + + + + + + static + __inline + void solveFriction(b3GpuConstraint4& cs, + const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4]) + { + + if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; + const b3Vector3& center = (const b3Vector3&)cs.m_center; + + b3Vector3 n = -(const b3Vector3&)cs.m_linear; + + b3Vector3 tangent[2]; +#if 1 + b3PlaneSpace1 (n, tangent[0],tangent[1]); +#else + b3Vector3 r = cs.m_worldPos[0]-center; + tangent[0] = cross3( n, r ); + tangent[1] = cross3( tangent[0], n ); + tangent[0] = normalize3( tangent[0] ); + tangent[1] = normalize3( tangent[1] ); +#endif + + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = center - posA; + b3Vector3 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs.m_fJacCoeffInv[i]; + + { + float prevSum = cs.m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max( updated, minRambdaDt[i] ); + updated = b3Min( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs.m_fAppliedRambdaDt[i] = updated; + } + + b3Vector3 linImp0 = invMassA*linear*rambdaDt; + b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; + b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; + b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; +#ifdef _WIN32 + b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp1.getX())); +#endif + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + + { // angular damping for point constraint + b3Vector3 ab = ( posB - posA ).normalized(); + b3Vector3 ac = ( center - posA ).normalized(); + if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = b3Dot( n, angVelA ); + float angNB = b3Dot( n, angVelB ); + + angVelA -= (angNA*0.1f)*n; + angVelB -= (angNB*0.1f)*n; + } + } + + } +/* + b3AlignedObjectArray<b3RigidBodyData>& m_bodies; + b3AlignedObjectArray<b3InertiaData>& m_shapes; + b3AlignedObjectArray<b3GpuConstraint4>& m_constraints; + b3AlignedObjectArray<int>* m_batchSizes; + int m_cellIndex; + int m_curWgidx; + int m_start; + int m_nConstraints; + bool m_solveFriction; + int m_maxNumBatches; + */ + +struct SolveTask// : public ThreadPool::Task +{ + SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints, + int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex) + : m_bodies( bodies ), m_shapes( shapes ), + m_constraints( constraints ), + m_batchSizes(batchSizes), + m_cellIndex(cellIndex), + m_curWgidx(curWgidx), + m_start( start ), + m_nConstraints( nConstraints ), + m_solveFriction( true ), + m_maxNumBatches(maxNumBatches) + {} + + unsigned short int getType(){ return 0; } + + void run(int tIdx) + { + int offset = 0; + for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + { + int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii); + if (!numInBatch) + break; + + for (int jj=0;jj<numInBatch;jj++) + { + int i = m_start + offset+jj; + int batchId = m_constraints[i].m_batchIdx; + b3Assert(batchId==ii); + float frictionCoeff = m_constraints[i].getFrictionCoeff(); + int aIdx = (int)m_constraints[i].m_bodyA; + int bIdx = (int)m_constraints[i].m_bodyB; +// int localBatch = m_constraints[i].m_batchIdx; + b3RigidBodyData& bodyA = m_bodies[aIdx]; + b3RigidBodyData& bodyB = m_bodies[bIdx]; + + if( !m_solveFriction ) + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + } + else + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=m_constraints[i].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + + } + } + offset+=numInBatch; + + + } +/* for (int bb=0;bb<m_maxNumBatches;bb++) + { + //for(int ic=m_nConstraints-1; ic>=0; ic--) + for(int ic=0; ic<m_nConstraints; ic++) + { + + int i = m_start + ic; + if (m_constraints[i].m_batchIdx != bb) + continue; + + float frictionCoeff = m_constraints[i].getFrictionCoeff(); + int aIdx = (int)m_constraints[i].m_bodyA; + int bIdx = (int)m_constraints[i].m_bodyB; + int localBatch = m_constraints[i].m_batchIdx; + b3RigidBodyData& bodyA = m_bodies[aIdx]; + b3RigidBodyData& bodyB = m_bodies[bIdx]; + + if( !m_solveFriction ) + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + } + else + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=m_constraints[i].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + + } + } + } + */ + + + + } + + b3AlignedObjectArray<b3RigidBodyData>& m_bodies; + b3AlignedObjectArray<b3InertiaData>& m_shapes; + b3AlignedObjectArray<b3GpuConstraint4>& m_constraints; + b3AlignedObjectArray<int>* m_batchSizes; + int m_cellIndex; + int m_curWgidx; + int m_start; + int m_nConstraints; + bool m_solveFriction; + int m_maxNumBatches; +}; + + +void b3Solver::solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes) +{ + +#if 0 + { + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + for (int z=0;z<4;z++) + { + for (int y=0;y<4;y++) + { + for (int x=0;x<4;x++) + { + int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY); + // printf("newIndex=%d\n",newIndex); + + int zIdx = newIndex/(nSplitX*nSplitY); + int remain = newIndex%(nSplitX*nSplitY); + int yIdx = remain/nSplitX; + int xIdx = remain%nSplitX; + // printf("newIndex=%d\n",newIndex); + } + } + } + + //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--) + for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++) + { + for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + { + int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + + /*int zIdx = newIndex/(nSplitX*nSplitY); + int remain = newIndex%(nSplitX*nSplitY); + int yIdx = remain/nSplitX; + int xIdx = remain%nSplitX; + */ + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch); + } + } + } +#endif + + b3AlignedObjectArray<b3RigidBodyData> bodyNative; + bodyBuf->copyToHost(bodyNative); + b3AlignedObjectArray<b3InertiaData> shapeNative; + shapeBuf->copyToHost(shapeNative); + b3AlignedObjectArray<b3GpuConstraint4> constraintNative; + constraint->copyToHost(constraintNative); + + b3AlignedObjectArray<unsigned int> numConstraintsHost; + m_numConstraints->copyToHost(numConstraintsHost); + + //printf("------------------------\n"); + b3AlignedObjectArray<unsigned int> offsetsHost; + m_offsets->copyToHost(offsetsHost); + static int frame=0; + bool useBatches=true; + if (useBatches) + { + for(int iter=0; iter<m_nIterations; iter++) + { + for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++) + { + + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + //printf("cell Batch %d\n",cellBatch); + b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS]; + for (int i=0;i<B3_SOLVER_N_CELLS;i++) + { + usedBodies[i].resize(0); + } + + + + + //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--) + for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + { + int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + + + if( numConstraintsHost[cellIdx] == 0 ) + continue; + + //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch); + //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]); + if (zIdx) + { + //printf("?\n"); + } + + if (iter==0) + { + //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx); + //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]); + } + const int start = offsetsHost[cellIdx]; + int numConstraintsInCell = numConstraintsHost[cellIdx]; + // const int end = start + numConstraintsInCell; + + SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx); + task.m_solveFriction = false; + task.run(0); + + } + } + } + + for(int iter=0; iter<m_nIterations; iter++) + { + for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++) + { + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + + + int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + + for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + { + int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + + if( numConstraintsHost[cellIdx] == 0 ) + continue; + + //printf("yIdx=%d\n",yIdx); + + const int start = offsetsHost[cellIdx]; + int numConstraintsInCell = numConstraintsHost[cellIdx]; + // const int end = start + numConstraintsInCell; + + SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx); + task.m_solveFriction = true; + task.run(0); + + } + } + } + + + } else + { + for(int iter=0; iter<m_nIterations; iter++) + { + SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0); + task.m_solveFriction = false; + task.run(0); + } + + for(int iter=0; iter<m_nIterations; iter++) + { + SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0); + task.m_solveFriction = true; + task.run(0); + } + } + + bodyBuf->copyFromHost(bodyNative); + shapeBuf->copyFromHost(shapeNative); + constraint->copyFromHost(constraintNative); + frame++; + +} + +void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, + b3OpenCLArray<unsigned int>* m_numConstraints, + b3OpenCLArray<unsigned int>* m_offsets, + int batchId + ) +{ +// b3BufferInfoCL( m_numConstraints->getBufferCL() ), +// b3BufferInfoCL( m_offsets->getBufferCL() ) + + int cellBatch = batchId; + const int nn = B3_SOLVER_N_CELLS; +// int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; + + b3AlignedObjectArray<unsigned int> gN; + m_numConstraints->copyToHost(gN); + b3AlignedObjectArray<unsigned int> gOffsets; + m_offsets->copyToHost(gOffsets); + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + +// int bIdx = batchId; + + b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints; + constraint->copyToHost(cpuConstraints); + + printf("batch = %d\n", batchId); + + int numWorkgroups = nn/B3_SOLVER_N_BATCHES; + b3AlignedObjectArray<int> usedBodies; + + + for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + { + printf("wgIdx = %d ", wgIdx); + + int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2); + int remain = wgIdx%((nSplitX*nSplitY)); + int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1); + + + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + printf("cellIdx=%d\n",cellIdx); + if( gN[cellIdx] == 0 ) + continue; + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + for (int c=start;c<end;c++) + { + b3GpuConstraint4& constraint = cpuConstraints[c]; + //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB); + if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size()) + { + printf("error?\n"); + } + if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size()) + { + printf("error?\n"); + } + } + + for (int c=start;c<end;c++) + { + b3GpuConstraint4& constraint = cpuConstraints[c]; + usedBodies.push_back(constraint.m_bodyA); + usedBodies.push_back(constraint.m_bodyB); + } + + } +} + +static bool verify=false; + +void b3Solver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches) +{ + + + b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); + { + + const int nn = B3_SOLVER_N_CELLS; + + cdata.x = 0; + cdata.y = maxNumBatches;//250; + + + int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; +#ifdef DEBUG_ME + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); +#endif + + + + { + + B3_PROFILE("m_batchSolveKernel iterations"); + for(int iter=0; iter<m_nIterations; iter++) + { + for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + { + + if (verify) + { + checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib); + } + +#ifdef DEBUG_ME + memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); + gpuDebugInfo.write(debugInfo,numWorkItems); +#endif + + + cdata.z = ib; + + + b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel"); +#if 1 + + b3BufferInfoCL bInfo[] = { + + b3BufferInfoCL( bodyBuf->getBufferCL() ), + b3BufferInfoCL( shapeBuf->getBufferCL() ), + b3BufferInfoCL( constraint->getBufferCL() ), + b3BufferInfoCL( m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + , b3BufferInfoCL(&gpuDebugInfo) +#endif + }; + + + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); + launcher.launch1D( numWorkItems, 64 ); + + +#else + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D( num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } + +#endif + + +#ifdef DEBUG_ME + clFinish(m_queue); + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + for (int i=0;i<numWorkItems;i++) + { + if (debugInfo[i].m_valInt2>0) + { + printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + } + } +#endif //DEBUG_ME + + + } + } + + clFinish(m_queue); + + + } + + cdata.x = 1; + bool applyFriction=true; + if (applyFriction) + { + B3_PROFILE("m_batchSolveKernel iterations2"); + for(int iter=0; iter<m_nIterations; iter++) + { + for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + { + cdata.z = ib; + + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( bodyBuf->getBufferCL() ), + b3BufferInfoCL( shapeBuf->getBufferCL() ), + b3BufferInfoCL( constraint->getBufferCL() ), + b3BufferInfoCL( m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + ,b3BufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME + }; + b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); + + launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); + } + } + clFinish(m_queue); + + } +#ifdef DEBUG_ME + delete[] debugInfo; +#endif //DEBUG_ME + } + + +} + +void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg ) +{ +// b3OpenCLArray<b3GpuConstraint4>* constraintNative =0; + contactCOut->resize(nContacts); + struct CB + { + int m_nContacts; + float m_dt; + float m_positionDrift; + float m_positionConstraintCoeff; + }; + + { + + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_dt = cfg.m_dt; + cdata.m_positionDrift = cfg.m_positionDrift; + cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; + + + if (gConvertConstraintOnCpu) + { + b3AlignedObjectArray<b3RigidBodyData> gBodies; + bodyBuf->copyToHost(gBodies); + + b3AlignedObjectArray<b3Contact4> gContact; + contactsIn->copyToHost(gContact); + + b3AlignedObjectArray<b3InertiaData> gShapes; + shapeBuf->copyToHost(gShapes); + + b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut; + gConstraintOut.resize(nContacts); + + B3_PROFILE("cpu contactToConstraintKernel"); + for (int gIdx=0;gIdx<nContacts;gIdx++) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + b3Float4 posA = gBodies[aIdx].m_pos; + b3Float4 linVelA = gBodies[aIdx].m_linVel; + b3Float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia; + + b3Float4 posB = gBodies[bIdx].m_pos; + b3Float4 linVelB = gBodies[bIdx].m_linVel; + b3Float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia; + + b3ContactConstraint4_t cs; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = (b3GpuConstraint4&)cs; + } + + contactCOut->copyFromHost(gConstraintOut); + + } else + { + B3_PROFILE("gpu m_contactToConstraintKernel"); + + + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()), + b3BufferInfoCL( contactCOut->getBufferCL() )}; + b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + //launcher.setConst( cdata ); + + launcher.setConst(cdata.m_nContacts); + launcher.setConst(cdata.m_dt); + launcher.setConst(cdata.m_positionDrift); + launcher.setConst(cdata.m_positionConstraintCoeff); + + launcher.launch1D( nContacts, 64 ); + clFinish(m_queue); + + } + } + + +} + +/* +void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData, + int nContacts, const b3Solver::ConstraintCfg& cfg ) +{ + + + + const int sortAlignment = 512; // todo. get this out of sort + if( cfg.m_enableParallelSolve ) + { + + + int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); + + b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost ); + b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost ); + + { // 2. set cell idx + struct CB + { + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; + }; + + b3Assert( sortSize%64 == 0 ); + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_staticIdx = cfg.m_staticIdx; + cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); + cdata.m_nSplit = B3_SOLVER_N_SPLIT; + + + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_queue, m_setSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( sortSize, 64 ); + } + + { // 3. sort by cell idx + int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; + int sortBit = 32; + //if( n <= 0xffff ) sortBit = 16; + //if( n <= 0xff ) sortBit = 8; + m_sort32->execute(*m_sortDataBuffer,sortSize); + } + { // 4. find entries + m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT); + + m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT ); + } + + { // 5. sort constraints by cellIdx + // todo. preallocate this +// b3Assert( contactsIn->getType() == TYPE_HOST ); +// b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer + + { + + + b3Int4 cdata; cdata.x = nContacts; + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_queue, m_reorderContactKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + } +// BufferUtils::unmap<true>( out, contactsIn, nContacts ); + } + } + + +} + +*/ +void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx ) +{ + + int numWorkItems = 64*B3_SOLVER_N_CELLS; + { + B3_PROFILE("batch generation"); + + b3Int4 cdata; + cdata.x = nContacts; + cdata.y = 0; + cdata.z = staticIdx; + + +#ifdef BATCH_DEBUG + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); + memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); + gpuDebugInfo.write(debugInfo,numWorkItems); +#endif + + + +#if 0 + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( contacts->getBufferCL() ), + b3BufferInfoCL( m_contactBuffer2->getBufferCL()), + b3BufferInfoCL( nNative->getBufferCL() ), + b3BufferInfoCL( offsetsNative->getBufferCL() ), +#ifdef BATCH_DEBUG + , b3BufferInfoCL(&gpuDebugInfo) +#endif + }; +#endif + + + + { + m_batchSizes.resize(nNative->size()); + B3_PROFILE("batchingKernel"); + //b3LauncherCL launcher( m_queue, m_batchingKernel); + cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel; + + b3LauncherCL launcher( m_queue, k,"*batchingKernel"); + if (!useNewBatchingKernel ) + { + launcher.setBuffer( contacts->getBufferCL() ); + } + launcher.setBuffer( m_contactBuffer2->getBufferCL() ); + launcher.setBuffer( nNative->getBufferCL()); + launcher.setBuffer( offsetsNative->getBufferCL()); + + launcher.setBuffer(m_batchSizes.getBufferCL()); + + + //launcher.setConst( cdata ); + launcher.setConst(staticIdx); + + launcher.launch1D( numWorkItems, 64 ); + //clFinish(m_queue); + //b3AlignedObjectArray<int> batchSizesCPU; + //m_batchSizes.copyToHost(batchSizesCPU); + //printf(".\n"); + } + +#ifdef BATCH_DEBUG + aaaa + b3Contact4* hostContacts = new b3Contact4[nContacts]; + m_contactBuffer->read(hostContacts,nContacts); + clFinish(m_queue); + + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + + for (int i=0;i<numWorkItems;i++) + { + if (debugInfo[i].m_valInt1>0) + { + printf("catch\n"); + } + if (debugInfo[i].m_valInt2>0) + { + printf("catch22\n"); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("catch666\n"); + } + + if (debugInfo[i].m_valInt4>0) + { + printf("catch777\n"); + } + } + delete[] debugInfo; +#endif //BATCH_DEBUG + + } + +// copy buffer to buffer + //b3Assert(m_contactBuffer->size()==nContacts); + //contacts->copyFromOpenCLArray( *m_contactBuffer); + //clFinish(m_queue);//needed? + + + +} + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.h new file mode 100644 index 0000000000..b37f2f1bec --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/b3Solver.h @@ -0,0 +1,126 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#ifndef __ADL_SOLVER_H +#define __ADL_SOLVER_H + +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "b3GpuConstraint4.h" + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h" + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" + + +#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) + +enum +{ + B3_SOLVER_N_SPLIT_X = 8,//16,//4, + B3_SOLVER_N_SPLIT_Y = 4,//16,//4, + B3_SOLVER_N_SPLIT_Z = 8,//, + B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X*B3_SOLVER_N_SPLIT_Y*B3_SOLVER_N_SPLIT_Z, + B3_SOLVER_N_BATCHES = 8,//4,//8,//4, + B3_MAX_NUM_BATCHES = 128, +}; + +class b3SolverBase +{ + public: + + + struct ConstraintCfg + { + ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {} + + float m_positionDrift; + float m_positionConstraintCoeff; + float m_dt; + bool m_enableParallelSolve; + float m_batchCellSize; + int m_staticIdx; + }; + +}; + +class b3Solver : public b3SolverBase +{ + public: + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + + b3OpenCLArray<unsigned int>* m_numConstraints; + b3OpenCLArray<unsigned int>* m_offsets; + b3OpenCLArray<int> m_batchSizes; + + + int m_nIterations; + cl_kernel m_batchingKernel; + cl_kernel m_batchingKernelNew; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; + cl_kernel m_contactToConstraintKernel; + cl_kernel m_setSortDataKernel; + cl_kernel m_reorderContactKernel; + cl_kernel m_copyConstraintKernel; + + class b3RadixSort32CL* m_sort32; + class b3BoundSearchCL* m_search; + class b3PrefixScanCL* m_scan; + + b3OpenCLArray<b3SortData>* m_sortDataBuffer; + b3OpenCLArray<b3Contact4>* m_contactBuffer2; + + enum + { + DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000, + }; + + + + + b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); + + virtual ~b3Solver(); + + void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches); + + void solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, b3AlignedObjectArray<int>* batchSizes); + + + void convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg ); + + void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ); + +}; + + + + +#endif //__ADL_SOLVER_H diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl new file mode 100644 index 0000000000..3b891b863d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl @@ -0,0 +1,353 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +#define WG_SIZE 64 + + + + + +typedef struct +{ + int m_n; + int m_start; + int m_staticIdx; + int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_a; + int m_b; + u32 m_idx; +}Elem; + +#define STACK_SIZE (WG_SIZE*10) +//#define STACK_SIZE (WG_SIZE) +#define RING_SIZE 1024 +#define RING_SIZE_MASK (RING_SIZE-1) +#define CHECK_SIZE (WG_SIZE) + + +#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd) +#define RING_END ldsTmp + +u32 readBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + return buff[bufIdx] & (1<<bitIdx); +} + +void writeBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; +// buff[bufIdx] |= (1<<bitIdx); + atom_or( &buff[bufIdx], (1<<bitIdx) ); +} + +u32 tryWrite(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) ); + return ((ans >> bitIdx)&1) == 0; +} + +// batching on the GPU +__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut, + __global const u32* gN, __global const u32* gStart, __global int* batchSizes, + int m_staticIdx ) +{ + __local u32 ldsStackIdx[STACK_SIZE]; + __local u32 ldsStackEnd; + __local Elem ldsRingElem[RING_SIZE]; + __local u32 ldsRingEnd; + __local u32 ldsTmp; + __local u32 ldsCheckBuffer[CHECK_SIZE]; + __local u32 ldsFixedBuffer[CHECK_SIZE]; + __local u32 ldsGEnd; + __local u32 ldsDstEnd; + + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + + const int m_n = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + + if( lIdx == 0 ) + { + ldsRingEnd = 0; + ldsGEnd = 0; + ldsStackEnd = 0; + ldsDstEnd = m_start; + } + + + +// while(1) +//was 250 + int ie=0; + int maxBatch = 0; + for(ie=0; ie<50; ie++) + { + ldsFixedBuffer[lIdx] = 0; + + for(int giter=0; giter<4; giter++) + { + int ringCap = GET_RING_CAPACITY; + + // 1. fill ring + if( ldsGEnd < m_n ) + { + while( ringCap > WG_SIZE ) + { + if( ldsGEnd >= m_n ) break; + if( lIdx < ringCap - WG_SIZE ) + { + int srcIdx; + AtomInc1( ldsGEnd, srcIdx ); + if( srcIdx < m_n ) + { + int dstIdx; + AtomInc1( ldsRingEnd, dstIdx ); + + int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit; + int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit; + ldsRingElem[dstIdx].m_a = (a>b)? b:a; + ldsRingElem[dstIdx].m_b = (a>b)? a:b; + ldsRingElem[dstIdx].m_idx = srcIdx; + } + } + ringCap = GET_RING_CAPACITY; + } + } + + GROUP_LDS_BARRIER; + + // 2. fill stack + __local Elem* dst = ldsRingElem; + if( lIdx == 0 ) RING_END = 0; + + int srcIdx=lIdx; + int end = ldsRingEnd; + + { + for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE) + { + Elem e; + if(srcIdx<end) e = ldsRingElem[srcIdx]; + bool done = (srcIdx<end)?false:true; + + for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0; + + if( !done ) + { + int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a)); + int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b)); + + if( aUsed==0 && bUsed==0 ) + { + int aAvailable=1; + int bAvailable=1; + int ea = abs(e.m_a); + int eb = abs(e.m_b); + + bool aStatic = (e.m_a<0) ||(ea==m_staticIdx); + bool bStatic = (e.m_b<0) ||(eb==m_staticIdx); + + if (!aStatic) + aAvailable = tryWrite( ldsCheckBuffer, ea ); + if (!bStatic) + bAvailable = tryWrite( ldsCheckBuffer, eb ); + + //aAvailable = aStatic? 1: aAvailable; + //bAvailable = bStatic? 1: bAvailable; + + bool success = (aAvailable && bAvailable); + if(success) + { + + if (!aStatic) + writeBuf( ldsFixedBuffer, ea ); + if (!bStatic) + writeBuf( ldsFixedBuffer, eb ); + } + done = success; + } + } + + // put it aside + if(srcIdx<end) + { + if( done ) + { + int dstIdx; AtomInc1( ldsStackEnd, dstIdx ); + if( dstIdx < STACK_SIZE ) + ldsStackIdx[dstIdx] = e.m_idx; + else{ + done = false; + AtomAdd( ldsStackEnd, -1 ); + } + } + if( !done ) + { + int dstIdx; AtomInc1( RING_END, dstIdx ); + dst[dstIdx] = e; + } + } + + // if filled, flush + if( ldsStackEnd == STACK_SIZE ) + { + for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE) + { + int idx = m_start + ldsStackIdx[i]; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + gConstraintsOut[ dstIdx ].m_batchIdx = ie; + } + if( lIdx == 0 ) ldsStackEnd = 0; + + //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) + ldsFixedBuffer[lIdx] = 0; + } + } + } + + if( lIdx == 0 ) ldsRingEnd = RING_END; + } + + GROUP_LDS_BARRIER; + + for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE) + { + int idx = m_start + ldsStackIdx[i]; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + gConstraintsOut[ dstIdx ].m_batchIdx = ie; + } + + // in case it couldn't consume any pair. Flush them + // todo. Serial batch worth while? + if( ldsStackEnd == 0 ) + { + for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE) + { + int idx = m_start + ldsRingElem[i].m_idx; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + int curBatch = 100+i; + if (maxBatch < curBatch) + maxBatch = curBatch; + + gConstraintsOut[ dstIdx ].m_batchIdx = curBatch; + + } + GROUP_LDS_BARRIER; + if( lIdx == 0 ) ldsRingEnd = 0; + } + + if( lIdx == 0 ) ldsStackEnd = 0; + + GROUP_LDS_BARRIER; + + // termination + if( ldsGEnd == m_n && ldsRingEnd == 0 ) + break; + } + + if( lIdx == 0 ) + { + if (maxBatch < ie) + maxBatch=ie; + batchSizes[wgIdx]=maxBatch; + } + +} + + + + + + + + + + + + + + + + + + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h new file mode 100644 index 0000000000..150eedc94b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h @@ -0,0 +1,388 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* batchingKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"#define WG_SIZE 64\n" +"typedef struct \n" +"{\n" +" int m_n;\n" +" int m_start;\n" +" int m_staticIdx;\n" +" int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct \n" +"{\n" +" int m_a;\n" +" int m_b;\n" +" u32 m_idx;\n" +"}Elem;\n" +"#define STACK_SIZE (WG_SIZE*10)\n" +"//#define STACK_SIZE (WG_SIZE)\n" +"#define RING_SIZE 1024\n" +"#define RING_SIZE_MASK (RING_SIZE-1)\n" +"#define CHECK_SIZE (WG_SIZE)\n" +"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n" +"#define RING_END ldsTmp\n" +"u32 readBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" return buff[bufIdx] & (1<<bitIdx);\n" +"}\n" +"void writeBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +"// buff[bufIdx] |= (1<<bitIdx);\n" +" atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +"}\n" +"u32 tryWrite(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +" return ((ans >> bitIdx)&1) == 0;\n" +"}\n" +"// batching on the GPU\n" +"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n" +" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n" +" int m_staticIdx )\n" +"{\n" +" __local u32 ldsStackIdx[STACK_SIZE];\n" +" __local u32 ldsStackEnd;\n" +" __local Elem ldsRingElem[RING_SIZE];\n" +" __local u32 ldsRingEnd;\n" +" __local u32 ldsTmp;\n" +" __local u32 ldsCheckBuffer[CHECK_SIZE];\n" +" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" +" __local u32 ldsGEnd;\n" +" __local u32 ldsDstEnd;\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" \n" +" const int m_n = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsRingEnd = 0;\n" +" ldsGEnd = 0;\n" +" ldsStackEnd = 0;\n" +" ldsDstEnd = m_start;\n" +" }\n" +" \n" +" \n" +" \n" +"// while(1)\n" +"//was 250\n" +" int ie=0;\n" +" int maxBatch = 0;\n" +" for(ie=0; ie<50; ie++)\n" +" {\n" +" ldsFixedBuffer[lIdx] = 0;\n" +" for(int giter=0; giter<4; giter++)\n" +" {\n" +" int ringCap = GET_RING_CAPACITY;\n" +" \n" +" // 1. fill ring\n" +" if( ldsGEnd < m_n )\n" +" {\n" +" while( ringCap > WG_SIZE )\n" +" {\n" +" if( ldsGEnd >= m_n ) break;\n" +" if( lIdx < ringCap - WG_SIZE )\n" +" {\n" +" int srcIdx;\n" +" AtomInc1( ldsGEnd, srcIdx );\n" +" if( srcIdx < m_n )\n" +" {\n" +" int dstIdx;\n" +" AtomInc1( ldsRingEnd, dstIdx );\n" +" \n" +" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n" +" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n" +" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n" +" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n" +" ldsRingElem[dstIdx].m_idx = srcIdx;\n" +" }\n" +" }\n" +" ringCap = GET_RING_CAPACITY;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" // 2. fill stack\n" +" __local Elem* dst = ldsRingElem;\n" +" if( lIdx == 0 ) RING_END = 0;\n" +" int srcIdx=lIdx;\n" +" int end = ldsRingEnd;\n" +" {\n" +" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n" +" {\n" +" Elem e;\n" +" if(srcIdx<end) e = ldsRingElem[srcIdx];\n" +" bool done = (srcIdx<end)?false:true;\n" +" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n" +" \n" +" if( !done )\n" +" {\n" +" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n" +" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n" +" if( aUsed==0 && bUsed==0 )\n" +" {\n" +" int aAvailable=1;\n" +" int bAvailable=1;\n" +" int ea = abs(e.m_a);\n" +" int eb = abs(e.m_b);\n" +" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n" +" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n" +" \n" +" if (!aStatic)\n" +" aAvailable = tryWrite( ldsCheckBuffer, ea );\n" +" if (!bStatic)\n" +" bAvailable = tryWrite( ldsCheckBuffer, eb );\n" +" \n" +" //aAvailable = aStatic? 1: aAvailable;\n" +" //bAvailable = bStatic? 1: bAvailable;\n" +" bool success = (aAvailable && bAvailable);\n" +" if(success)\n" +" {\n" +" \n" +" if (!aStatic)\n" +" writeBuf( ldsFixedBuffer, ea );\n" +" if (!bStatic)\n" +" writeBuf( ldsFixedBuffer, eb );\n" +" }\n" +" done = success;\n" +" }\n" +" }\n" +" // put it aside\n" +" if(srcIdx<end)\n" +" {\n" +" if( done )\n" +" {\n" +" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n" +" if( dstIdx < STACK_SIZE )\n" +" ldsStackIdx[dstIdx] = e.m_idx;\n" +" else{\n" +" done = false;\n" +" AtomAdd( ldsStackEnd, -1 );\n" +" }\n" +" }\n" +" if( !done )\n" +" {\n" +" int dstIdx; AtomInc1( RING_END, dstIdx );\n" +" dst[dstIdx] = e;\n" +" }\n" +" }\n" +" // if filled, flush\n" +" if( ldsStackEnd == STACK_SIZE )\n" +" {\n" +" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsStackIdx[i];\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" +" }\n" +" if( lIdx == 0 ) ldsStackEnd = 0;\n" +" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n" +" ldsFixedBuffer[lIdx] = 0;\n" +" }\n" +" }\n" +" }\n" +" if( lIdx == 0 ) ldsRingEnd = RING_END;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsStackIdx[i];\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" +" }\n" +" // in case it couldn't consume any pair. Flush them\n" +" // todo. Serial batch worth while?\n" +" if( ldsStackEnd == 0 )\n" +" {\n" +" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsRingElem[i].m_idx;\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" int curBatch = 100+i;\n" +" if (maxBatch < curBatch)\n" +" maxBatch = curBatch;\n" +" \n" +" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n" +" \n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 ) ldsRingEnd = 0;\n" +" }\n" +" if( lIdx == 0 ) ldsStackEnd = 0;\n" +" GROUP_LDS_BARRIER;\n" +" // termination\n" +" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n" +" break;\n" +" }\n" +" if( lIdx == 0 )\n" +" {\n" +" if (maxBatch < ie)\n" +" maxBatch=ie;\n" +" batchSizes[wgIdx]=maxBatch;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl new file mode 100644 index 0000000000..ba1b66d2c3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl @@ -0,0 +1,231 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + +#define SIMD_WIDTH 64 + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +#define WG_SIZE 64 + + + + + +typedef struct +{ + int m_n; + int m_start; + int m_staticIdx; + int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_a; + int m_b; + u32 m_idx; +}Elem; + + + + + +// batching on the GPU +__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx ) +{ + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + + const int m_n = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + + if( lIdx == 0 ) + { + for (int i=0;i<m_n;i++) + { + int srcIdx = i+m_start; + int batchIndex = i; + gConstraints[ srcIdx ].m_batchIdx = batchIndex; + } + } +} + + +#define CHECK_SIZE (WG_SIZE) + + + + +u32 readBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + return buff[bufIdx] & (1<<bitIdx); +} + +void writeBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + buff[bufIdx] |= (1<<bitIdx); + //atom_or( &buff[bufIdx], (1<<bitIdx) ); +} + +u32 tryWrite(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) ); + return ((ans >> bitIdx)&1) == 0; +} + + +// batching on the GPU +__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx ) +{ + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + const int numConstraints = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + b3Contact4Data_t tmp; + + __local u32 ldsFixedBuffer[CHECK_SIZE]; + + + + + + if( lIdx == 0 ) + { + + + __global struct b3Contact4Data* cs = &gConstraints[m_start]; + + + int numValidConstraints = 0; + int batchIdx = 0; + + while( numValidConstraints < numConstraints) + { + int nCurrentBatch = 0; + // clear flag + + for(int i=0; i<CHECK_SIZE; i++) + ldsFixedBuffer[i] = 0; + + for(int i=numValidConstraints; i<numConstraints; i++) + { + + int bodyAS = cs[i].m_bodyAPtrAndSignBit; + int bodyBS = cs[i].m_bodyBPtrAndSignBit; + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA); + int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB); + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + { + writeBuf( ldsFixedBuffer, bodyA ); + } + if (!bIsStatic) + { + writeBuf( ldsFixedBuffer, bodyB ); + } + + cs[i].m_batchIdx = batchIdx; + + if (i!=numValidConstraints) + { + + tmp = cs[i]; + cs[i] = cs[numValidConstraints]; + cs[numValidConstraints] = tmp; + + + } + + numValidConstraints++; + + nCurrentBatch++; + if( nCurrentBatch == SIMD_WIDTH) + { + nCurrentBatch = 0; + for(int i=0; i<CHECK_SIZE; i++) + ldsFixedBuffer[i] = 0; + + } + } + }//for + batchIdx ++; + }//while + + batchSizes[wgIdx] = batchIdx; + + }//if( lIdx == 0 ) + + //return batchIdx; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h new file mode 100644 index 0000000000..1e5957adae --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h @@ -0,0 +1,291 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* batchingKernelsNewCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"#define SIMD_WIDTH 64\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"#define WG_SIZE 64\n" +"typedef struct \n" +"{\n" +" int m_n;\n" +" int m_start;\n" +" int m_staticIdx;\n" +" int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct \n" +"{\n" +" int m_a;\n" +" int m_b;\n" +" u32 m_idx;\n" +"}Elem;\n" +"// batching on the GPU\n" +"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n" +"{\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" \n" +" const int m_n = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" for (int i=0;i<m_n;i++)\n" +" {\n" +" int srcIdx = i+m_start;\n" +" int batchIndex = i;\n" +" gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n" +" }\n" +" }\n" +"}\n" +"#define CHECK_SIZE (WG_SIZE)\n" +"u32 readBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" return buff[bufIdx] & (1<<bitIdx);\n" +"}\n" +"void writeBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" buff[bufIdx] |= (1<<bitIdx);\n" +" //atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +"}\n" +"u32 tryWrite(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +" return ((ans >> bitIdx)&1) == 0;\n" +"}\n" +"// batching on the GPU\n" +"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n" +"{\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" const int numConstraints = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" b3Contact4Data_t tmp;\n" +" \n" +" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" +" \n" +" \n" +" \n" +" \n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" \n" +" \n" +" __global struct b3Contact4Data* cs = &gConstraints[m_start]; \n" +" \n" +" \n" +" int numValidConstraints = 0;\n" +" int batchIdx = 0;\n" +" while( numValidConstraints < numConstraints)\n" +" {\n" +" int nCurrentBatch = 0;\n" +" // clear flag\n" +" \n" +" for(int i=0; i<CHECK_SIZE; i++) \n" +" ldsFixedBuffer[i] = 0; \n" +" for(int i=numValidConstraints; i<numConstraints; i++)\n" +" {\n" +" int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n" +" int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n" +" int bodyA = abs(bodyAS);\n" +" int bodyB = abs(bodyBS);\n" +" bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n" +" bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n" +" int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n" +" int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n" +" \n" +" if( aUnavailable==0 && bUnavailable==0 ) // ok\n" +" {\n" +" if (!aIsStatic)\n" +" {\n" +" writeBuf( ldsFixedBuffer, bodyA );\n" +" }\n" +" if (!bIsStatic)\n" +" {\n" +" writeBuf( ldsFixedBuffer, bodyB );\n" +" }\n" +" cs[i].m_batchIdx = batchIdx;\n" +" if (i!=numValidConstraints)\n" +" {\n" +" tmp = cs[i];\n" +" cs[i] = cs[numValidConstraints];\n" +" cs[numValidConstraints] = tmp;\n" +" }\n" +" numValidConstraints++;\n" +" \n" +" nCurrentBatch++;\n" +" if( nCurrentBatch == SIMD_WIDTH)\n" +" {\n" +" nCurrentBatch = 0;\n" +" for(int i=0; i<CHECK_SIZE; i++) \n" +" ldsFixedBuffer[i] = 0;\n" +" \n" +" }\n" +" }\n" +" }//for\n" +" batchIdx ++;\n" +" }//while\n" +" \n" +" batchSizes[wgIdx] = batchIdx;\n" +" }//if( lIdx == 0 )\n" +" \n" +" //return batchIdx;\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl new file mode 100644 index 0000000000..e22bc9bc33 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl @@ -0,0 +1,32 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + +#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h" + + + +__kernel void + integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration) +{ + int nodeID = get_global_id(0); + + if( nodeID < numNodes) + { + integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h new file mode 100644 index 0000000000..a5a432947c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h @@ -0,0 +1,433 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* integrateKernelCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" +"{\n" +" \n" +" if (bodies[nodeID].m_invMass != 0.f)\n" +" {\n" +" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" +" //angular velocity\n" +" {\n" +" b3Float4 axis;\n" +" //add some hardcoded angular damping\n" +" bodies[nodeID].m_angVel.x *= angularDamping;\n" +" bodies[nodeID].m_angVel.y *= angularDamping;\n" +" bodies[nodeID].m_angVel.z *= angularDamping;\n" +" \n" +" b3Float4 angvel = bodies[nodeID].m_angVel;\n" +" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" +" \n" +" //limit the angular motion\n" +" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" +" {\n" +" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" +" }\n" +" if(fAngle < 0.001f)\n" +" {\n" +" // use Taylor's expansions of sync function\n" +" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" +" }\n" +" else\n" +" {\n" +" // sync(fAngle) = sin(c*fAngle)/t\n" +" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" +" }\n" +" \n" +" b3Quat dorn;\n" +" dorn.x = axis.x;\n" +" dorn.y = axis.y;\n" +" dorn.z = axis.z;\n" +" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" +" b3Quat orn0 = bodies[nodeID].m_quat;\n" +" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" +" predictedOrn = b3QuatNormalized(predictedOrn);\n" +" bodies[nodeID].m_quat=predictedOrn;\n" +" }\n" +" //linear velocity \n" +" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n" +" \n" +" //apply gravity\n" +" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n" +" \n" +" }\n" +" \n" +"}\n" +"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" +"{\n" +" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" +" \n" +" if( (body->m_invMass != 0.f))\n" +" {\n" +" //angular velocity\n" +" {\n" +" b3Float4 axis;\n" +" //add some hardcoded angular damping\n" +" body->m_angVel.x *= angularDamping;\n" +" body->m_angVel.y *= angularDamping;\n" +" body->m_angVel.z *= angularDamping;\n" +" \n" +" b3Float4 angvel = body->m_angVel;\n" +" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" +" //limit the angular motion\n" +" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" +" {\n" +" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" +" }\n" +" if(fAngle < 0.001f)\n" +" {\n" +" // use Taylor's expansions of sync function\n" +" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" +" }\n" +" else\n" +" {\n" +" // sync(fAngle) = sin(c*fAngle)/t\n" +" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" +" }\n" +" b3Quat dorn;\n" +" dorn.x = axis.x;\n" +" dorn.y = axis.y;\n" +" dorn.z = axis.z;\n" +" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" +" b3Quat orn0 = body->m_quat;\n" +" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" +" predictedOrn = b3QuatNormalized(predictedOrn);\n" +" body->m_quat=predictedOrn;\n" +" }\n" +" //apply gravity\n" +" body->m_linVel += gravityAcceleration * timeStep;\n" +" //linear velocity \n" +" body->m_pos += body->m_linVel * timeStep;\n" +" \n" +" }\n" +" \n" +"}\n" +"__kernel void \n" +" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n" +"{\n" +" int nodeID = get_global_id(0);\n" +" \n" +" if( nodeID < numNodes)\n" +" {\n" +" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl new file mode 100644 index 0000000000..7f5dabe274 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl @@ -0,0 +1,877 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#define B3_CONSTRAINT_FLAG_ENABLED 1 + +#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3 +#define B3_GPU_FIXED_CONSTRAINT_TYPE 4 + +#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails +#define B3_INFINITY 1e30f + +#define mymake_float4 (float4) + + +__inline float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + +typedef float4 Quaternion; + + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + + + +typedef struct +{ + Matrix3x3 m_invInertiaWorld; + Matrix3x3 m_initInvInertia; +} BodyInertia; + + +typedef struct +{ + Matrix3x3 m_basis;//orientation + float4 m_origin;//transform +}b3Transform; + +typedef struct +{ +// b3Transform m_worldTransformUnused; + float4 m_deltaLinearVelocity; + float4 m_deltaAngularVelocity; + float4 m_angularFactor; + float4 m_linearFactor; + float4 m_invMass; + float4 m_pushVelocity; + float4 m_turnVelocity; + float4 m_linearVelocity; + float4 m_angularVelocity; + + union + { + void* m_originalBody; + int m_originalBodyIndex; + }; + int padding[3]; + +} b3GpuSolverBody; + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + unsigned int m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} b3RigidBodyCL; + +typedef struct +{ + + float4 m_relpos1CrossNormal; + float4 m_contactNormal; + + float4 m_relpos2CrossNormal; + //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal + + float4 m_angularComponentA; + float4 m_angularComponentB; + + float m_appliedPushImpulse; + float m_appliedImpulse; + int m_padding1; + int m_padding2; + float m_friction; + float m_jacDiagABInv; + float m_rhs; + float m_cfm; + + float m_lowerLimit; + float m_upperLimit; + float m_rhsPenetration; + int m_originalConstraint; + + + int m_overrideNumSolverIterations; + int m_frictionIndex; + int m_solverBodyIdA; + int m_solverBodyIdB; + +} b3SolverConstraint; + +typedef struct +{ + int m_bodyAPtrAndSignBit; + int m_bodyBPtrAndSignBit; + int m_originalConstraintIndex; + int m_batchId; +} b3BatchConstraint; + + + + + + +typedef struct +{ + int m_constraintType; + int m_rbA; + int m_rbB; + float m_breakingImpulseThreshold; + + float4 m_pivotInA; + float4 m_pivotInB; + Quaternion m_relTargetAB; + + int m_flags; + int m_padding[3]; +} b3GpuGenericConstraint; + + +/*b3Transform getWorldTransform(b3RigidBodyCL* rb) +{ + b3Transform newTrans; + newTrans.setOrigin(rb->m_pos); + newTrans.setRotation(rb->m_quat); + return newTrans; +}*/ + + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float4 fastNormalize4(float4 v) +{ + v = mymake_float4(v.xyz,0.f); + return fast_normalize(v); +} + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + + +__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude) +{ + body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); +} + + +void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c) +{ + float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm; + float deltaVel1Dotn = dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + + deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + + float sum = c->m_appliedImpulse + deltaImpulse; + if (sum < c->m_lowerLimit) + { + deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse; + c->m_appliedImpulse = c->m_lowerLimit; + } + else if (sum > c->m_upperLimit) + { + deltaImpulse = c->m_upperLimit-c->m_appliedImpulse; + c->m_appliedImpulse = c->m_upperLimit; + } + else + { + c->m_appliedImpulse = sum; + } + + internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); + internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); + +} + +__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, + __global b3BatchConstraint* batchConstraints, + __global b3SolverConstraint* rows, + __global unsigned int* numConstraintRowsInfo1, + __global unsigned int* rowOffsets, + __global b3GpuGenericConstraint* constraints, + int batchOffset, + int numConstraintsInBatch + ) +{ + int b = get_global_id(0); + if (b>=numConstraintsInBatch) + return; + + __global b3BatchConstraint* c = &batchConstraints[b+batchOffset]; + int originalConstraintIndex = c->m_originalConstraintIndex; + if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED) + { + int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex]; + int rowOffset = rowOffsets[originalConstraintIndex]; + for (int jj=0;jj<numConstraintRows;jj++) + { + __global b3SolverConstraint* constraint = &rows[rowOffset+jj]; + resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint); + } + } +}; + +__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies) +{ + int i = get_global_id(0); + if (i>=numBodies) + return; + + __global b3GpuSolverBody* solverBody = &solverBodies[i]; + __global b3RigidBodyCL* bodyCL = &bodiesCL[i]; + + solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f); + solverBody->m_originalBodyIndex = i; + solverBody->m_angularFactor = (float4)(1,1,1,0); + solverBody->m_linearFactor = (float4) (1,1,1,0); + solverBody->m_linearVelocity = bodyCL->m_linVel; + solverBody->m_angularVelocity = bodyCL->m_angVel; +} + +__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints) +{ + int cid = get_global_id(0); + if (cid>=numConstraints) + return; + int numRows = numConstraintRows[cid]; + if (numRows) + { + for (int i=0;i<numRows;i++) + { + int rowIndex = rowOffsets[cid]+i; + float breakingThreshold = constraints[cid].m_breakingImpulseThreshold; + if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold) + { + constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED; + } + } + } +} + + + +__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints) +{ + int i = get_global_id(0); + if (i>=numConstraints) + return; + + __global b3GpuGenericConstraint* constraint = &constraints[i]; + + switch (constraint->m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + infos[i] = 3; + break; + } + case B3_GPU_FIXED_CONSTRAINT_TYPE: + { + infos[i] = 6; + break; + } + default: + { + } + } +} + +__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, + __global b3BatchConstraint* batchConstraints, + __global b3GpuGenericConstraint* constraints, + __global b3RigidBodyCL* bodies, + int numConstraints) +{ + int i = get_global_id(0); + if (i>=numConstraints) + return; + + int rbA = constraints[i].m_rbA; + int rbB = constraints[i].m_rbB; + + batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA; + batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB; + batchConstraints[i].m_batchId = -1; + batchConstraints[i].m_originalConstraintIndex = i; + +} + + + + +typedef struct +{ + // integrator parameters: frames per second (1/stepsize), default error + // reduction parameter (0..1). + float fps,erp; + + // for the first and second body, pointers to two (linear and angular) + // n*3 jacobian sub matrices, stored by rows. these matrices will have + // been initialized to 0 on entry. if the second body is zero then the + // J2xx pointers may be 0. + union + { + __global float4* m_J1linearAxisFloat4; + __global float* m_J1linearAxis; + }; + union + { + __global float4* m_J1angularAxisFloat4; + __global float* m_J1angularAxis; + + }; + union + { + __global float4* m_J2linearAxisFloat4; + __global float* m_J2linearAxis; + }; + union + { + __global float4* m_J2angularAxisFloat4; + __global float* m_J2angularAxis; + }; + // elements to jump from one row to the next in J's + int rowskip; + + // right hand sides of the equation J*v = c + cfm * lambda. cfm is the + // "constraint force mixing" vector. c is set to zero on entry, cfm is + // set to a constant value (typically very small or zero) value on entry. + __global float* m_constraintError; + __global float* cfm; + + // lo and hi limits for variables (set to -/+ infinity on entry). + __global float* m_lowerLimit; + __global float* m_upperLimit; + + // findex vector for variables. see the LCP solver interface for a + // description of what this does. this is set to -1 on entry. + // note that the returned indexes are relative to the first index of + // the constraint. + __global int *findex; + // number of solver iterations + int m_numIterations; + + //damping of the velocity + float m_damping; +} b3GpuConstraintInfo2; + + +void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2) +{ + *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f); + *v1 = (float4)(vecIn.z ,0. ,-vecIn.x,0.f); + *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f); +} + + +void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies) +{ + float4 posA = bodies[constraint->m_rbA].m_pos; + Quaternion rotA = bodies[constraint->m_rbA].m_quat; + + float4 posB = bodies[constraint->m_rbB].m_pos; + Quaternion rotB = bodies[constraint->m_rbB].m_quat; + + + + // anchor points in global coordinates with respect to body PORs. + + // set jacobian + info->m_J1linearAxis[0] = 1; + info->m_J1linearAxis[info->rowskip+1] = 1; + info->m_J1linearAxis[2*info->rowskip+2] = 1; + + float4 a1 = qtRotate(rotA,constraint->m_pivotInA); + + { + __global float4* angular0 = (__global float4*)(info->m_J1angularAxis); + __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip); + __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip); + float4 a1neg = -a1; + getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2); + } + if (info->m_J2linearAxis) + { + info->m_J2linearAxis[0] = -1; + info->m_J2linearAxis[info->rowskip+1] = -1; + info->m_J2linearAxis[2*info->rowskip+2] = -1; + } + + float4 a2 = qtRotate(rotB,constraint->m_pivotInB); + + { + // float4 a2n = -a2; + __global float4* angular0 = (__global float4*)(info->m_J2angularAxis); + __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip); + __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip); + getSkewSymmetricMatrix(a2,angular0,angular1,angular2); + } + + // set right hand side +// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp; + float currERP = info->erp; + + float k = info->fps * currERP; + int j; + float4 result = a2 + posB - a1 - posA; + float* resultPtr = &result; + + for (j=0; j<3; j++) + { + info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]); + } +} + +Quaternion nearest( Quaternion first, Quaternion qd) +{ + Quaternion diff,sum; + diff = first- qd; + sum = first + qd; + + if( dot(diff,diff) < dot(sum,sum) ) + return qd; + return (-qd); +} + +float b3Acos(float x) +{ + if (x<-1) + x=-1; + if (x>1) + x=1; + return acos(x); +} + +float getAngle(Quaternion orn) +{ + if (orn.w>=1.f) + orn.w=1.f; + float s = 2.f * b3Acos(orn.w); + return s; +} + +void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle) +{ + Quaternion orn1 = nearest(orn0,orn1a); + + Quaternion dorn = qtMul(orn1,qtInvert(orn0)); + *angle = getAngle(dorn); + *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f); + + //check for axis length + float len = dot3F4(*axis,*axis); + if (len < FLT_EPSILON*FLT_EPSILON) + *axis = (float4)(1,0,0,0); + else + *axis /= sqrt(len); +} + + + +void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row) +{ + Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat; + Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat; + + int s = info->rowskip; + int start_index = start_row * s; + + // 3 rows to make body rotations equal + info->m_J1angularAxis[start_index] = 1; + info->m_J1angularAxis[start_index + s + 1] = 1; + info->m_J1angularAxis[start_index + s*2+2] = 1; + if ( info->m_J2angularAxis) + { + info->m_J2angularAxis[start_index] = -1; + info->m_J2angularAxis[start_index + s+1] = -1; + info->m_J2angularAxis[start_index + s*2+2] = -1; + } + + float currERP = info->erp; + float k = info->fps * currERP; + float4 diff; + float angle; + float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB)); + + calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle); + diff*=-angle; + + float* resultPtr = &diff; + + for (int j=0; j<3; j++) + { + info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j]; + } + + +} + + +__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies) +{ + int i = get_global_id(0); + if (i>=numBodies) + return; + + if (bodies[i].m_invMass) + { +// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP) + { + bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity; + } +// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP) + { + bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity; + } + } +} + + +__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, + __global unsigned int* infos, + __global unsigned int* constraintRowOffsets, + __global b3GpuGenericConstraint* constraints, + __global b3BatchConstraint* batchConstraints, + __global b3RigidBodyCL* bodies, + __global BodyInertia* inertias, + __global b3GpuSolverBody* solverBodies, + float timeStep, + float globalErp, + float globalCfm, + float globalDamping, + int globalNumIterations, + int numConstraints) +{ + + int i = get_global_id(0); + if (i>=numConstraints) + return; + + //for now, always initialize the batch info + int info1 = infos[i]; + + __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]]; + __global b3GpuGenericConstraint* constraint = &constraints[i]; + + __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA]; + __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB]; + + int solverBodyIdA = constraint->m_rbA; + int solverBodyIdB = constraint->m_rbB; + + __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA]; + __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB]; + + + if (rbA->m_invMass) + { + batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; + } else + { +// if (!solverBodyIdA) +// m_staticIdx = 0; + batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA; + } + + if (rbB->m_invMass) + { + batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; + } else + { +// if (!solverBodyIdB) +// m_staticIdx = 0; + batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; + } + + if (info1) + { + int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; +// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + // m_maxOverrideNumSolverIterations = overrideNumSolverIterations; + + + int j; + for ( j=0;j<info1;j++) + { +// memset(¤tConstraintRow[j],0,sizeof(b3SolverConstraint)); + currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0); + currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0); + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_cfm = 0.f; + currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_friction = 0.f; + currentConstraintRow[j].m_frictionIndex = 0; + currentConstraintRow[j].m_jacDiagABInv = 0.f; + currentConstraintRow[j].m_lowerLimit = 0.f; + currentConstraintRow[j].m_upperLimit = 0.f; + + currentConstraintRow[j].m_originalConstraint = i; + currentConstraintRow[j].m_overrideNumSolverIterations = 0; + currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_rhs = 0.f; + currentConstraintRow[j].m_rhsPenetration = 0.f; + currentConstraintRow[j].m_solverBodyIdA = 0; + currentConstraintRow[j].m_solverBodyIdB = 0; + + currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; + currentConstraintRow[j].m_upperLimit = B3_INFINITY; + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA; + currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB; + currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; + } + + bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0); + bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0); + bodyAPtr->m_pushVelocity = (float4)(0,0,0,0); + bodyAPtr->m_turnVelocity = (float4)(0,0,0,0); + bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0); + bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0); + bodyBPtr->m_pushVelocity = (float4)(0,0,0,0); + bodyBPtr->m_turnVelocity = (float4)(0,0,0,0); + + int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this + + + + + b3GpuConstraintInfo2 info2; + info2.fps = 1.f/timeStep; + info2.erp = globalErp; + info2.m_J1linearAxisFloat4 = ¤tConstraintRow->m_contactNormal; + info2.m_J1angularAxisFloat4 = ¤tConstraintRow->m_relpos1CrossNormal; + info2.m_J2linearAxisFloat4 = 0; + info2.m_J2angularAxisFloat4 = ¤tConstraintRow->m_relpos2CrossNormal; + info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this + + ///the size of b3SolverConstraint needs be a multiple of float +// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint)); + info2.m_constraintError = ¤tConstraintRow->m_rhs; + currentConstraintRow->m_cfm = globalCfm; + info2.m_damping = globalDamping; + info2.cfm = ¤tConstraintRow->m_cfm; + info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; + info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; + info2.m_numIterations = globalNumIterations; + + switch (constraint->m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + getInfo2Point2Point(constraint,&info2,bodies); + break; + } + case B3_GPU_FIXED_CONSTRAINT_TYPE: + { + getInfo2Point2Point(constraint,&info2,bodies); + + getInfo2FixedOrientation(constraint,&info2,bodies,3); + + break; + } + + default: + { + } + } + + ///finalize the constraint setup + for ( j=0;j<info1;j++) + { + __global b3SolverConstraint* solverConstraint = ¤tConstraintRow[j]; + + if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold) + { + solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold; + } + + if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold) + { + solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold; + } + +// solverConstraint->m_originalContactPoint = constraint; + + Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld; + { + + //float4 angularFactorA(1,1,1); + float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal; + solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA; + } + + Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld; + { + + float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal; + solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor(); + } + + { + //it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal + //because it gets multiplied iMJlB + float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass; + float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA); + float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal? + float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB); + + float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal); + sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal); + sum += dot3F4(iMJlB,solverConstraint->m_contactNormal); + sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal); + float fsum = fabs(sum); + if (fsum>FLT_EPSILON) + { + solverConstraint->m_jacDiagABInv = 1.f/sum; + } else + { + solverConstraint->m_jacDiagABInv = 0.f; + } + } + + + ///fix rhs + ///todo: add force/torque accelerators + { + float rel_vel; + float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel); + float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel); + + rel_vel = vel1Dotn+vel2Dotn; + + float restitution = 0.f; + float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2 + float velocityError = restitution - rel_vel * info2.m_damping; + float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv; + float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv; + solverConstraint->m_rhs = penetrationImpulse+velocityImpulse; + solverConstraint->m_appliedImpulse = 0.f; + + } + } + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h new file mode 100644 index 0000000000..d48ecf6ea6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h @@ -0,0 +1,721 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveConstraintRowsCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#define B3_CONSTRAINT_FLAG_ENABLED 1\n" +"#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n" +"#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n" +"#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n" +"#define B3_INFINITY 1e30f\n" +"#define mymake_float4 (float4)\n" +"__inline float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"typedef float4 Quaternion;\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertiaWorld;\n" +" Matrix3x3 m_initInvInertia;\n" +"} BodyInertia;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_basis;//orientation\n" +" float4 m_origin;//transform\n" +"}b3Transform;\n" +"typedef struct\n" +"{\n" +"// b3Transform m_worldTransformUnused;\n" +" float4 m_deltaLinearVelocity;\n" +" float4 m_deltaAngularVelocity;\n" +" float4 m_angularFactor;\n" +" float4 m_linearFactor;\n" +" float4 m_invMass;\n" +" float4 m_pushVelocity;\n" +" float4 m_turnVelocity;\n" +" float4 m_linearVelocity;\n" +" float4 m_angularVelocity;\n" +" union \n" +" {\n" +" void* m_originalBody;\n" +" int m_originalBodyIndex;\n" +" };\n" +" int padding[3];\n" +"} b3GpuSolverBody;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" unsigned int m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} b3RigidBodyCL;\n" +"typedef struct\n" +"{\n" +" float4 m_relpos1CrossNormal;\n" +" float4 m_contactNormal;\n" +" float4 m_relpos2CrossNormal;\n" +" //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n" +" float4 m_angularComponentA;\n" +" float4 m_angularComponentB;\n" +" \n" +" float m_appliedPushImpulse;\n" +" float m_appliedImpulse;\n" +" int m_padding1;\n" +" int m_padding2;\n" +" float m_friction;\n" +" float m_jacDiagABInv;\n" +" float m_rhs;\n" +" float m_cfm;\n" +" \n" +" float m_lowerLimit;\n" +" float m_upperLimit;\n" +" float m_rhsPenetration;\n" +" int m_originalConstraint;\n" +" int m_overrideNumSolverIterations;\n" +" int m_frictionIndex;\n" +" int m_solverBodyIdA;\n" +" int m_solverBodyIdB;\n" +"} b3SolverConstraint;\n" +"typedef struct \n" +"{\n" +" int m_bodyAPtrAndSignBit;\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_originalConstraintIndex;\n" +" int m_batchId;\n" +"} b3BatchConstraint;\n" +"typedef struct \n" +"{\n" +" int m_constraintType;\n" +" int m_rbA;\n" +" int m_rbB;\n" +" float m_breakingImpulseThreshold;\n" +" float4 m_pivotInA;\n" +" float4 m_pivotInB;\n" +" Quaternion m_relTargetAB;\n" +" int m_flags;\n" +" int m_padding[3];\n" +"} b3GpuGenericConstraint;\n" +"/*b3Transform getWorldTransform(b3RigidBodyCL* rb)\n" +"{\n" +" b3Transform newTrans;\n" +" newTrans.setOrigin(rb->m_pos);\n" +" newTrans.setRotation(rb->m_quat);\n" +" return newTrans;\n" +"}*/\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" v = mymake_float4(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n" +"{\n" +" body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n" +" body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n" +"}\n" +"void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n" +"{\n" +" float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n" +" float deltaVel1Dotn = dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n" +" float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n" +" deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv;\n" +" deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv;\n" +" float sum = c->m_appliedImpulse + deltaImpulse;\n" +" if (sum < c->m_lowerLimit)\n" +" {\n" +" deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse;\n" +" c->m_appliedImpulse = c->m_lowerLimit;\n" +" }\n" +" else if (sum > c->m_upperLimit) \n" +" {\n" +" deltaImpulse = c->m_upperLimit-c->m_appliedImpulse;\n" +" c->m_appliedImpulse = c->m_upperLimit;\n" +" }\n" +" else\n" +" {\n" +" c->m_appliedImpulse = sum;\n" +" }\n" +" internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" +" internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" +"}\n" +"__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n" +" __global b3BatchConstraint* batchConstraints,\n" +" __global b3SolverConstraint* rows,\n" +" __global unsigned int* numConstraintRowsInfo1, \n" +" __global unsigned int* rowOffsets,\n" +" __global b3GpuGenericConstraint* constraints,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int b = get_global_id(0);\n" +" if (b>=numConstraintsInBatch)\n" +" return;\n" +" __global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n" +" int originalConstraintIndex = c->m_originalConstraintIndex;\n" +" if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n" +" {\n" +" int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];\n" +" int rowOffset = rowOffsets[originalConstraintIndex];\n" +" for (int jj=0;jj<numConstraintRows;jj++)\n" +" {\n" +" __global b3SolverConstraint* constraint = &rows[rowOffset+jj];\n" +" resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);\n" +" }\n" +" }\n" +"};\n" +"__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numBodies)\n" +" return;\n" +" __global b3GpuSolverBody* solverBody = &solverBodies[i];\n" +" __global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n" +" solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f);\n" +" solverBody->m_originalBodyIndex = i;\n" +" solverBody->m_angularFactor = (float4)(1,1,1,0);\n" +" solverBody->m_linearFactor = (float4) (1,1,1,0);\n" +" solverBody->m_linearVelocity = bodyCL->m_linVel;\n" +" solverBody->m_angularVelocity = bodyCL->m_angVel;\n" +"}\n" +"__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n" +"{\n" +" int cid = get_global_id(0);\n" +" if (cid>=numConstraints)\n" +" return;\n" +" int numRows = numConstraintRows[cid];\n" +" if (numRows)\n" +" {\n" +" for (int i=0;i<numRows;i++)\n" +" {\n" +" int rowIndex = rowOffsets[cid]+i;\n" +" float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;\n" +" if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)\n" +" {\n" +" constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" __global b3GpuGenericConstraint* constraint = &constraints[i];\n" +" switch (constraint->m_constraintType)\n" +" {\n" +" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" +" {\n" +" infos[i] = 3;\n" +" break;\n" +" }\n" +" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" +" {\n" +" infos[i] = 6;\n" +" break;\n" +" }\n" +" default:\n" +" {\n" +" }\n" +" }\n" +"}\n" +"__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n" +" __global b3BatchConstraint* batchConstraints, \n" +" __global b3GpuGenericConstraint* constraints,\n" +" __global b3RigidBodyCL* bodies,\n" +" int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" int rbA = constraints[i].m_rbA;\n" +" int rbB = constraints[i].m_rbB;\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA;\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB;\n" +" batchConstraints[i].m_batchId = -1;\n" +" batchConstraints[i].m_originalConstraintIndex = i;\n" +"}\n" +"typedef struct\n" +"{\n" +" // integrator parameters: frames per second (1/stepsize), default error\n" +" // reduction parameter (0..1).\n" +" float fps,erp;\n" +" // for the first and second body, pointers to two (linear and angular)\n" +" // n*3 jacobian sub matrices, stored by rows. these matrices will have\n" +" // been initialized to 0 on entry. if the second body is zero then the\n" +" // J2xx pointers may be 0.\n" +" union \n" +" {\n" +" __global float4* m_J1linearAxisFloat4;\n" +" __global float* m_J1linearAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J1angularAxisFloat4;\n" +" __global float* m_J1angularAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J2linearAxisFloat4;\n" +" __global float* m_J2linearAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J2angularAxisFloat4;\n" +" __global float* m_J2angularAxis;\n" +" };\n" +" // elements to jump from one row to the next in J's\n" +" int rowskip;\n" +" // right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n" +" // \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n" +" // set to a constant value (typically very small or zero) value on entry.\n" +" __global float* m_constraintError;\n" +" __global float* cfm;\n" +" // lo and hi limits for variables (set to -/+ infinity on entry).\n" +" __global float* m_lowerLimit;\n" +" __global float* m_upperLimit;\n" +" // findex vector for variables. see the LCP solver interface for a\n" +" // description of what this does. this is set to -1 on entry.\n" +" // note that the returned indexes are relative to the first index of\n" +" // the constraint.\n" +" __global int *findex;\n" +" // number of solver iterations\n" +" int m_numIterations;\n" +" //damping of the velocity\n" +" float m_damping;\n" +"} b3GpuConstraintInfo2;\n" +"void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n" +"{\n" +" *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f);\n" +" *v1 = (float4)(vecIn.z ,0. ,-vecIn.x,0.f);\n" +" *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f);\n" +"}\n" +"void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n" +"{\n" +" float4 posA = bodies[constraint->m_rbA].m_pos;\n" +" Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n" +" float4 posB = bodies[constraint->m_rbB].m_pos;\n" +" Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n" +" // anchor points in global coordinates with respect to body PORs.\n" +" \n" +" // set jacobian\n" +" info->m_J1linearAxis[0] = 1;\n" +" info->m_J1linearAxis[info->rowskip+1] = 1;\n" +" info->m_J1linearAxis[2*info->rowskip+2] = 1;\n" +" float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n" +" {\n" +" __global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n" +" __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n" +" __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);\n" +" float4 a1neg = -a1;\n" +" getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2);\n" +" }\n" +" if (info->m_J2linearAxis)\n" +" {\n" +" info->m_J2linearAxis[0] = -1;\n" +" info->m_J2linearAxis[info->rowskip+1] = -1;\n" +" info->m_J2linearAxis[2*info->rowskip+2] = -1;\n" +" }\n" +" \n" +" float4 a2 = qtRotate(rotB,constraint->m_pivotInB);\n" +" \n" +" {\n" +" // float4 a2n = -a2;\n" +" __global float4* angular0 = (__global float4*)(info->m_J2angularAxis);\n" +" __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);\n" +" __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);\n" +" getSkewSymmetricMatrix(a2,angular0,angular1,angular2);\n" +" }\n" +" \n" +" // set right hand side\n" +"// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n" +" float currERP = info->erp;\n" +" float k = info->fps * currERP;\n" +" int j;\n" +" float4 result = a2 + posB - a1 - posA;\n" +" float* resultPtr = &result;\n" +" for (j=0; j<3; j++)\n" +" {\n" +" info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n" +" }\n" +"}\n" +"Quaternion nearest( Quaternion first, Quaternion qd)\n" +"{\n" +" Quaternion diff,sum;\n" +" diff = first- qd;\n" +" sum = first + qd;\n" +" \n" +" if( dot(diff,diff) < dot(sum,sum) )\n" +" return qd;\n" +" return (-qd);\n" +"}\n" +"float b3Acos(float x) \n" +"{ \n" +" if (x<-1) \n" +" x=-1; \n" +" if (x>1) \n" +" x=1;\n" +" return acos(x); \n" +"}\n" +"float getAngle(Quaternion orn)\n" +"{\n" +" if (orn.w>=1.f)\n" +" orn.w=1.f;\n" +" float s = 2.f * b3Acos(orn.w);\n" +" return s;\n" +"}\n" +"void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n" +"{\n" +" Quaternion orn1 = nearest(orn0,orn1a);\n" +" \n" +" Quaternion dorn = qtMul(orn1,qtInvert(orn0));\n" +" *angle = getAngle(dorn);\n" +" *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);\n" +" \n" +" //check for axis length\n" +" float len = dot3F4(*axis,*axis);\n" +" if (len < FLT_EPSILON*FLT_EPSILON)\n" +" *axis = (float4)(1,0,0,0);\n" +" else\n" +" *axis /= sqrt(len);\n" +"}\n" +"void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n" +"{\n" +" Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n" +" Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n" +" int s = info->rowskip;\n" +" int start_index = start_row * s;\n" +" // 3 rows to make body rotations equal\n" +" info->m_J1angularAxis[start_index] = 1;\n" +" info->m_J1angularAxis[start_index + s + 1] = 1;\n" +" info->m_J1angularAxis[start_index + s*2+2] = 1;\n" +" if ( info->m_J2angularAxis)\n" +" {\n" +" info->m_J2angularAxis[start_index] = -1;\n" +" info->m_J2angularAxis[start_index + s+1] = -1;\n" +" info->m_J2angularAxis[start_index + s*2+2] = -1;\n" +" }\n" +" \n" +" float currERP = info->erp;\n" +" float k = info->fps * currERP;\n" +" float4 diff;\n" +" float angle;\n" +" float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB));\n" +" \n" +" calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);\n" +" diff*=-angle;\n" +" \n" +" float* resultPtr = &diff;\n" +" \n" +" for (int j=0; j<3; j++)\n" +" {\n" +" info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n" +" }\n" +" \n" +"}\n" +"__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numBodies)\n" +" return;\n" +" if (bodies[i].m_invMass)\n" +" {\n" +"// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n" +" {\n" +" bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;\n" +" }\n" +"// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)\n" +" {\n" +" bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;\n" +" } \n" +" }\n" +"}\n" +"__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n" +" __global unsigned int* infos, \n" +" __global unsigned int* constraintRowOffsets, \n" +" __global b3GpuGenericConstraint* constraints, \n" +" __global b3BatchConstraint* batchConstraints, \n" +" __global b3RigidBodyCL* bodies,\n" +" __global BodyInertia* inertias,\n" +" __global b3GpuSolverBody* solverBodies,\n" +" float timeStep,\n" +" float globalErp,\n" +" float globalCfm,\n" +" float globalDamping,\n" +" int globalNumIterations,\n" +" int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" \n" +" //for now, always initialize the batch info\n" +" int info1 = infos[i];\n" +" \n" +" __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n" +" __global b3GpuGenericConstraint* constraint = &constraints[i];\n" +" __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n" +" __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n" +" int solverBodyIdA = constraint->m_rbA;\n" +" int solverBodyIdB = constraint->m_rbB;\n" +" __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n" +" __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n" +" if (rbA->m_invMass)\n" +" {\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n" +" } else\n" +" {\n" +"// if (!solverBodyIdA)\n" +"// m_staticIdx = 0;\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n" +" }\n" +" if (rbB->m_invMass)\n" +" {\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n" +" } else\n" +" {\n" +"// if (!solverBodyIdB)\n" +"// m_staticIdx = 0;\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n" +" }\n" +" if (info1)\n" +" {\n" +" int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n" +"// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n" +" // m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n" +" int j;\n" +" for ( j=0;j<info1;j++)\n" +" {\n" +"// memset(¤tConstraintRow[j],0,sizeof(b3SolverConstraint));\n" +" currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" +" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" +" currentConstraintRow[j].m_cfm = 0.f;\n" +" currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_friction = 0.f;\n" +" currentConstraintRow[j].m_frictionIndex = 0;\n" +" currentConstraintRow[j].m_jacDiagABInv = 0.f;\n" +" currentConstraintRow[j].m_lowerLimit = 0.f;\n" +" currentConstraintRow[j].m_upperLimit = 0.f;\n" +" currentConstraintRow[j].m_originalConstraint = i;\n" +" currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n" +" currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_rhs = 0.f;\n" +" currentConstraintRow[j].m_rhsPenetration = 0.f;\n" +" currentConstraintRow[j].m_solverBodyIdA = 0;\n" +" currentConstraintRow[j].m_solverBodyIdB = 0;\n" +" \n" +" currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;\n" +" currentConstraintRow[j].m_upperLimit = B3_INFINITY;\n" +" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" +" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" +" currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;\n" +" currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n" +" currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; \n" +" }\n" +" bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_turnVelocity = (float4)(0,0,0,0);\n" +" int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" +" \n" +" b3GpuConstraintInfo2 info2;\n" +" info2.fps = 1.f/timeStep;\n" +" info2.erp = globalErp;\n" +" info2.m_J1linearAxisFloat4 = ¤tConstraintRow->m_contactNormal;\n" +" info2.m_J1angularAxisFloat4 = ¤tConstraintRow->m_relpos1CrossNormal;\n" +" info2.m_J2linearAxisFloat4 = 0;\n" +" info2.m_J2angularAxisFloat4 = ¤tConstraintRow->m_relpos2CrossNormal;\n" +" info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" +" ///the size of b3SolverConstraint needs be a multiple of float\n" +"// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n" +" info2.m_constraintError = ¤tConstraintRow->m_rhs;\n" +" currentConstraintRow->m_cfm = globalCfm;\n" +" info2.m_damping = globalDamping;\n" +" info2.cfm = ¤tConstraintRow->m_cfm;\n" +" info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit;\n" +" info2.m_upperLimit = ¤tConstraintRow->m_upperLimit;\n" +" info2.m_numIterations = globalNumIterations;\n" +" switch (constraint->m_constraintType)\n" +" {\n" +" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" +" {\n" +" getInfo2Point2Point(constraint,&info2,bodies);\n" +" break;\n" +" }\n" +" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" +" {\n" +" getInfo2Point2Point(constraint,&info2,bodies);\n" +" getInfo2FixedOrientation(constraint,&info2,bodies,3);\n" +" break;\n" +" }\n" +" default:\n" +" {\n" +" }\n" +" }\n" +" ///finalize the constraint setup\n" +" for ( j=0;j<info1;j++)\n" +" {\n" +" __global b3SolverConstraint* solverConstraint = ¤tConstraintRow[j];\n" +" if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n" +" {\n" +" solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n" +" }\n" +" if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n" +" {\n" +" solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n" +" }\n" +"// solverConstraint->m_originalContactPoint = constraint;\n" +" \n" +" Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n" +" {\n" +" //float4 angularFactorA(1,1,1);\n" +" float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n" +" solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n" +" }\n" +" \n" +" Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n" +" {\n" +" float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n" +" solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n" +" }\n" +" {\n" +" //it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n" +" //because it gets multiplied iMJlB\n" +" float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;\n" +" float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n" +" float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n" +" float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n" +" float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n" +" sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n" +" sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n" +" sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);\n" +" float fsum = fabs(sum);\n" +" if (fsum>FLT_EPSILON)\n" +" {\n" +" solverConstraint->m_jacDiagABInv = 1.f/sum;\n" +" } else\n" +" {\n" +" solverConstraint->m_jacDiagABInv = 0.f;\n" +" }\n" +" }\n" +" ///fix rhs\n" +" ///todo: add force/torque accelerators\n" +" {\n" +" float rel_vel;\n" +" float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n" +" float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n" +" rel_vel = vel1Dotn+vel2Dotn;\n" +" float restitution = 0.f;\n" +" float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n" +" float velocityError = restitution - rel_vel * info2.m_damping;\n" +" float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;\n" +" float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n" +" solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n" +" solverConstraint->m_appliedImpulse = 0.f;\n" +" }\n" +" }\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl new file mode 100644 index 0000000000..5c4d62e4ec --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl @@ -0,0 +1,501 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define mymake_float4 (float4) +//#define make_float2 (float2) +//#define make_uint4 (uint4) +//#define make_int4 (int4) +//#define make_uint2 (uint2) +//#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// + + + + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = mymake_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + + + + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + + + + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1); + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = mymake_float4(-n.xyz,0.f); + *angular0 = -cross3(r0, n); + *angular1 = cross3(r1, n); +} + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ); + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1); + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/(jmj0+jmj1+jmj2+jmj3); +} + + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB); + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB) +{ + float minRambdaDt = 0; + float maxRambdaDt = FLT_MAX; + + for(int ic=0; ic<4; ic++) + { + if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + + float4 angular0, angular1, linear; + float4 r0 = cs->m_worldPos[ic] - posA; + float4 r1 = cs->m_worldPos[ic] - posB; + setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt ); + updated = min2( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + *linVelA += linImp0; + *angVelA += angImp0; + *linVelB += linImp1; + *angVelB += angImp1; + } +} + +void btPlaneSpace1 (const float4* n, float4* p, float4* q); + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (fabs(n[0].z) > 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + //float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, + posB, &linVelB, &angVelB, invMassB, invInertiaB ); + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + + } + +} + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelContact(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + __global int* batchSizes, + int maxBatch1, + int cellBatch, + int4 nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + + + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + + //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); + //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); + //int cellIdx = xIdx+yIdx*nSplit; + + if( gN[cellIdx] == 0 ) + return; + + int maxBatch = batchSizes[cellIdx]; + + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idx<end; ) + { + if (gConstraints[idx].m_batchIdx == ldsCurBatch) + { + solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); + + idx+=64; + } else + { + break; + } + } + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + ldsCurBatch++; + } + GROUP_LDS_BARRIER; + } + + +} + + + +__kernel void solveSingleContactKernel(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + int cellIdx, + int batchOffset, + int numConstraintsInBatch + ) +{ + + int index = get_global_id(0); + if (index < numConstraintsInBatch) + { + int idx=batchOffset+index; + solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h new file mode 100644 index 0000000000..15a049992b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -0,0 +1,393 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveContactCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define mymake_float4 (float4)\n" +"//#define make_float2 (float2)\n" +"//#define make_uint4 (uint4)\n" +"//#define make_int4 (int4)\n" +"//#define make_uint2 (uint2)\n" +"//#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = mymake_float4(-n.xyz,0.f);\n" +" *angular0 = -cross3(r0, n);\n" +" *angular1 = cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n" +"{\n" +" float minRambdaDt = 0;\n" +" float maxRambdaDt = FLT_MAX;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = cs->m_worldPos[ic] - posA;\n" +" float4 r1 = cs->m_worldPos[ic] - posB;\n" +" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" +" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n" +" rambdaDt *= cs->m_jacCoeffInv[ic];\n" +" {\n" +" float prevSum = cs->m_appliedRambdaDt[ic];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt );\n" +" updated = min2( updated, maxRambdaDt );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_appliedRambdaDt[ic] = updated;\n" +" }\n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" *linVelA += linImp0;\n" +" *angVelA += angImp0;\n" +" *linVelB += linImp1;\n" +" *angVelB += angImp1;\n" +" }\n" +"}\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n[0].z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" //float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +"}\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelContact(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" +" int cellBatch,\n" +" int4 nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +" \n" +" \n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" +" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" +" //int cellIdx = xIdx+yIdx*nSplit;\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +" int maxBatch = batchSizes[cellIdx];\n" +" \n" +" \n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +" \n" +" \n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx<end; )\n" +" {\n" +" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" +" {\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" idx+=64;\n" +" } else\n" +" {\n" +" break;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch++;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" \n" +" \n" +"}\n" +"__kernel void solveSingleContactKernel(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" int cellIdx,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if (index < numConstraintsInBatch)\n" +" {\n" +" int idx=batchOffset+index;\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" } \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl new file mode 100644 index 0000000000..1d70fbbae3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl @@ -0,0 +1,527 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define mymake_float4 (float4) +//#define make_float2 (float2) +//#define make_uint4 (uint4) +//#define make_int4 (int4) +//#define make_uint2 (uint2) +//#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// + + + + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = mymake_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + + + + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + + + + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1); + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = mymake_float4(-n.xyz,0.f); + *angular0 = -cross3(r0, n); + *angular1 = cross3(r1, n); +} + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ); + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1); + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/(jmj0+jmj1+jmj2+jmj3); +} +void btPlaneSpace1 (const float4* n, float4* p, float4* q); + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (fabs(n[0].z) > 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + + +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=ldsCs[0].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + +// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, +// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt ); + + + { + + __global Constraint4* cs = ldsCs; + + if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return; + const float4 center = cs->m_center; + + float4 n = -cs->m_linear; + + float4 tangent[2]; + btPlaneSpace1(&n,&tangent[0],&tangent[1]); + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs->m_fJacCoeffInv[i]; + + { + float prevSum = cs->m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs->m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + angVelA -= (angNA*0.1f)*n; + angVelB -= (angNB*0.1f)*n; + } + } + } + + + + } + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + } + + +} + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelFriction(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + __global int* batchSizes, + int maxBatch1, + int cellBatch, + int4 nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + + + if( gN[cellIdx] == 0 ) + return; + + int maxBatch = batchSizes[cellIdx]; + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idx<end; ) + { + if (gConstraints[idx].m_batchIdx == ldsCurBatch) + { + + solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] ); + + idx+=64; + } else + { + break; + } + } + GROUP_LDS_BARRIER; + if( lIdx == 0 ) + { + ldsCurBatch++; + } + GROUP_LDS_BARRIER; + } + + +} + + + + + + +__kernel void solveSingleFrictionKernel(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + int cellIdx, + int batchOffset, + int numConstraintsInBatch + ) +{ + + int index = get_global_id(0); + if (index < numConstraintsInBatch) + { + + int idx=batchOffset+index; + + solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] ); + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h new file mode 100644 index 0000000000..eb58674f22 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -0,0 +1,421 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveFrictionCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define mymake_float4 (float4)\n" +"//#define make_float2 (float2)\n" +"//#define make_uint4 (uint4)\n" +"//#define make_int4 (int4)\n" +"//#define make_uint2 (uint2)\n" +"//#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = mymake_float4(-n.xyz,0.f);\n" +" *angular0 = -cross3(r0, n);\n" +" *angular1 = cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n[0].z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" {\n" +" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" +" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" +" float sum = 0;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" +" }\n" +" frictionCoeff = 0.7f;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" maxRambdaDt[j] = frictionCoeff*sum;\n" +" minRambdaDt[j] = -maxRambdaDt[j];\n" +" }\n" +" \n" +"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" +" \n" +" \n" +" {\n" +" \n" +" __global Constraint4* cs = ldsCs;\n" +" \n" +" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" +" const float4 center = cs->m_center;\n" +" \n" +" float4 n = -cs->m_linear;\n" +" \n" +" float4 tangent[2];\n" +" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = center - posA;\n" +" float4 r1 = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB );\n" +" rambdaDt *= cs->m_fJacCoeffInv[i];\n" +" \n" +" {\n" +" float prevSum = cs->m_fAppliedRambdaDt[i];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt[i] );\n" +" updated = min2( updated, maxRambdaDt[i] );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_fAppliedRambdaDt[i] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" linVelA += linImp0;\n" +" angVelA += angImp0;\n" +" linVelB += linImp1;\n" +" angVelB += angImp1;\n" +" }\n" +" { // angular damping for point constraint\n" +" float4 ab = normalize3( posB - posA );\n" +" float4 ac = normalize3( center - posA );\n" +" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" +" {\n" +" float angNA = dot3F4( n, angVelA );\n" +" float angNB = dot3F4( n, angVelB );\n" +" \n" +" angVelA -= (angNA*0.1f)*n;\n" +" angVelB -= (angNB*0.1f)*n;\n" +" }\n" +" }\n" +" }\n" +" \n" +" \n" +" }\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" \n" +"}\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelFriction(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" +" int cellBatch,\n" +" int4 nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +" int maxBatch = batchSizes[cellIdx];\n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx<end; )\n" +" {\n" +" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" +" {\n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" idx+=64;\n" +" } else\n" +" {\n" +" break;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch++;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" \n" +" \n" +"}\n" +"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" int cellIdx,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if (index < numConstraintsInBatch)\n" +" {\n" +" \n" +" int idx=batchOffset+index;\n" +" \n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" } \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl new file mode 100644 index 0000000000..8e2de7b5a6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl @@ -0,0 +1,277 @@ + +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + + + +#define WG_SIZE 64 + + + + + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + + + + + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + + + +typedef struct +{ + int m_nContacts; + float m_dt; + float m_positionDrift; + float m_positionConstraintCoeff; +} ConstBufferCTC; + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, +int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff +) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia; + + b3ContactConstraint4_t cs; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +} + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h new file mode 100644 index 0000000000..eb1834ee00 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h @@ -0,0 +1,703 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetupCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#ifndef B3_CONTACT_CONSTRAINT5_H\n" +"#define B3_CONTACT_CONSTRAINT5_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n" +"struct b3ContactConstraint4\n" +"{\n" +" b3Float4 m_linear;//normal?\n" +" b3Float4 m_worldPos[4];\n" +" b3Float4 m_center; // friction\n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; // friction\n" +" float m_fAppliedRambdaDt[2]; // friction\n" +" unsigned int m_bodyA;\n" +" unsigned int m_bodyB;\n" +" int m_batchIdx;\n" +" unsigned int m_paddings;\n" +"};\n" +"//inline void setFrictionCoeff(float value) { m_linear[3] = value; }\n" +"inline float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n" +"{\n" +" return constraint->m_linear.w; \n" +"}\n" +"#endif //B3_CONTACT_CONSTRAINT5_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n" +" void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n" +"{\n" +" if (b3Fabs(n.z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n.y*n.y + n.z*n.z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n.z*k;\n" +" p[0].z = n.y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n.x*p[0].z;\n" +" q[0].z = n.x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n.x*n.x + n.y*n.y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n.y*k;\n" +" p[0].y = n.x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n.z*p[0].y;\n" +" q[0].y = n.z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +" \n" +"void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n" +"{\n" +" *linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n" +" *angular0 = b3Cross3(r0, n);\n" +" *angular1 = -b3Cross3(r1, n);\n" +"}\n" +"float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n" +" b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n" +"{\n" +" return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n" +" float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n" +" b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n" +" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n" +" b3ContactConstraint4_t* dstC )\n" +"{\n" +" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" +" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" +" float dtInv = 1.f/dt;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" +" dstC->m_linear = src->m_worldNormalOnB;\n" +" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" b3Float4 r0 = src->m_worldPosB[ic] - posA;\n" +" b3Float4 r1 = src->m_worldPosB[ic] - posB;\n" +" if( ic >= src->m_worldNormalOnB.w )//npoints\n" +" {\n" +" dstC->m_jacCoeffInv[ic] = 0.f;\n" +" continue;\n" +" }\n" +" float relVelN;\n" +" {\n" +" b3Float4 linear, angular0, angular1;\n" +" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" +" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB);\n" +" float e = 0.f;//src->getRestituitionCoeff();\n" +" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" +" dstC->m_b[ic] = e*relVelN;\n" +" //float penetration = src->m_worldPosB[ic].w;\n" +" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" }\n" +" if( src->m_worldNormalOnB.w > 0 )//npoints\n" +" { // prepare friction\n" +" b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" +" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" +" center += src->m_worldPosB[i];\n" +" center /= (float)src->m_worldNormalOnB.w;\n" +" b3Float4 tangent[2];\n" +" b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" +" \n" +" b3Float4 r[2];\n" +" r[0] = center - posA;\n" +" r[1] = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" b3Float4 linear, angular0, angular1;\n" +" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" +" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" +" }\n" +" dstC->m_center = center;\n" +" }\n" +" for(int i=0; i<4; i++)\n" +" {\n" +" if( i<src->m_worldNormalOnB.w )\n" +" {\n" +" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" +" }\n" +" else\n" +" {\n" +" dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" +" }\n" +" }\n" +"}\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +" \n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" float m_dt;\n" +" float m_positionDrift;\n" +" float m_positionConstraintCoeff;\n" +"} ConstBufferCTC;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n" +"int nContacts,\n" +"float dt,\n" +"float positionDrift,\n" +"float positionConstraintCoeff\n" +")\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n" +" b3ContactConstraint4_t cs;\n" +" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" +" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n" +" &cs );\n" +" \n" +" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" +" gConstraintOut[gIdx] = cs;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl new file mode 100644 index 0000000000..3dc48d4350 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl @@ -0,0 +1,613 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +Matrix3x3 mtZero(); + +__inline +Matrix3x3 mtIdentity(); + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m); + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + +__inline +Matrix3x3 mtZero() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(0.f); + m.m_row[1] = (float4)(0.f); + m.m_row[2] = (float4)(0.f); + return m; +} + +__inline +Matrix3x3 mtIdentity() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(1,0,0,0); + m.m_row[1] = (float4)(0,1,0,0); + m.m_row[2] = (float4)(0,0,1,0); + return m; +} + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m) +{ + Matrix3x3 out; + out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); + out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); + out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); + return out; +} + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) +{ + Matrix3x3 transB; + transB = mtTranspose( b ); + Matrix3x3 ans; + // why this doesn't run when 0ing in the for{} + a.m_row[0].w = 0.f; + a.m_row[1].w = 0.f; + a.m_row[2].w = 0.f; + for(int i=0; i<3; i++) + { +// a.m_row[i].w = 0.f; + ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); + ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); + ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); + ans.m_row[i].w = 0.f; + } + return ans; +} + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + + + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +// others +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb ) +{ + int nContacts = cb.x; + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int srcIdx = sortData[gIdx].y; + out[gIdx] = in[srcIdx]; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sd; + sd.x = contactsIn[gIdx].m_childIndexB; + sd.y = gIdx; + sortDataOut[gIdx] = sd; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_childIndexA; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + + + + +typedef struct +{ + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; +} ConstBufferSSD; + + +__constant const int gridTable4x4[] = +{ + 0,1,17,16, + 1,2,18,19, + 17,18,32,3, + 16,19,3,34 +}; + +__constant const int gridTable8x8[] = +{ + 0, 2, 3, 16, 17, 18, 19, 1, + 66, 64, 80, 67, 82, 81, 65, 83, + 131,144,128,130,147,129,145,146, + 208,195,194,192,193,211,210,209, + 21, 22, 23, 5, 4, 6, 7, 20, + 86, 85, 69, 87, 70, 68, 84, 71, + 151,133,149,150,135,148,132,134, + 197,27,214,213,212,199,198,196 + +}; + + + + +#define USE_SPATIAL_BATCHING 1 +#define USE_4x4_GRID 1 + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, +int nContacts,float scale,int4 nSplit,int staticIdx) + +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; + int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; + + int aIdx = abs(aPtrAndSignBit ); + int bIdx = abs(bPtrAndSignBit); + + bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); + bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx); + +#if USE_SPATIAL_BATCHING + int idx = (aStatic)? bIdx: aIdx; + float4 p = gBodies[idx].m_pos; + int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); + int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); + int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); + int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); + +#else//USE_SPATIAL_BATCHING + #if USE_4x4_GRID + int aa = aIdx&3; + int bb = bIdx&3; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*4; + int newIndex = gridTable4x4[gridIndex]; + #else//USE_4x4_GRID + int aa = aIdx&7; + int bb = bIdx&7; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*8; + int newIndex = gridTable8x8[gridIndex]; + #endif//USE_4x4_GRID +#endif//USE_SPATIAL_BATCHING + + + gSortDataOut[gIdx].x = newIndex; + gSortDataOut[gIdx].y = gIdx; + } + else + { + gSortDataOut[gIdx].x = 0xffffffff; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb ) +{ + int gIdx = GET_GLOBAL_IDX; + if( gIdx < cb.x ) + { + gOut[gIdx] = gIn[gIdx]; + } +} + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h new file mode 100644 index 0000000000..1b5819f6cf --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h @@ -0,0 +1,601 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetup2CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +" \n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"// others\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" +"{\n" +" int nContacts = cb.x;\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int srcIdx = sortData[gIdx].y;\n" +" out[gIdx] = in[srcIdx];\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sd;\n" +" sd.x = contactsIn[gIdx].m_childIndexB;\n" +" sd.y = gIdx;\n" +" sortDataOut[gIdx] = sd;\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_childIndexA;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" int m_staticIdx;\n" +" float m_scale;\n" +" int m_nSplit;\n" +"} ConstBufferSSD;\n" +"__constant const int gridTable4x4[] = \n" +"{\n" +" 0,1,17,16,\n" +" 1,2,18,19,\n" +" 17,18,32,3,\n" +" 16,19,3,34\n" +"};\n" +"__constant const int gridTable8x8[] = \n" +"{\n" +" 0, 2, 3, 16, 17, 18, 19, 1,\n" +" 66, 64, 80, 67, 82, 81, 65, 83,\n" +" 131,144,128,130,147,129,145,146,\n" +" 208,195,194,192,193,211,210,209,\n" +" 21, 22, 23, 5, 4, 6, 7, 20,\n" +" 86, 85, 69, 87, 70, 68, 84, 71,\n" +" 151,133,149,150,135,148,132,134,\n" +" 197,27,214,213,212,199,198,196\n" +" \n" +"};\n" +"#define USE_SPATIAL_BATCHING 1\n" +"#define USE_4x4_GRID 1\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" +"int nContacts,float scale,int4 nSplit,int staticIdx)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;\n" +" int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;\n" +" int aIdx = abs(aPtrAndSignBit );\n" +" int bIdx = abs(bPtrAndSignBit);\n" +" bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n" +" bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n" +"#if USE_SPATIAL_BATCHING \n" +" int idx = (aStatic)? bIdx: aIdx;\n" +" float4 p = gBodies[idx].m_pos;\n" +" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n" +" int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n" +" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n" +" int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n" +" \n" +"#else//USE_SPATIAL_BATCHING\n" +" #if USE_4x4_GRID\n" +" int aa = aIdx&3;\n" +" int bb = bIdx&3;\n" +" if (aStatic)\n" +" aa = bb;\n" +" if (bStatic)\n" +" bb = aa;\n" +" int gridIndex = aa + bb*4;\n" +" int newIndex = gridTable4x4[gridIndex];\n" +" #else//USE_4x4_GRID\n" +" int aa = aIdx&7;\n" +" int bb = bIdx&7;\n" +" if (aStatic)\n" +" aa = bb;\n" +" if (bStatic)\n" +" bb = aa;\n" +" int gridIndex = aa + bb*8;\n" +" int newIndex = gridTable8x8[gridIndex];\n" +" #endif//USE_4x4_GRID\n" +"#endif//USE_SPATIAL_BATCHING\n" +" gSortDataOut[gIdx].x = newIndex;\n" +" gSortDataOut[gIdx].y = gIdx;\n" +" }\n" +" else\n" +" {\n" +" gSortDataOut[gIdx].x = 0xffffffff;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < cb.x )\n" +" {\n" +" gOut[gIdx] = gIn[gIdx];\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl new file mode 100644 index 0000000000..a21a08c3b4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl @@ -0,0 +1,968 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a1, float4 b1) +{ + + float4 a=make_float4(a1.xyz,0.f); + float4 b=make_float4(b1.xyz,0.f); + //float4 a=a1; + //float4 b=b1; + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +Matrix3x3 mtZero(); + +__inline +Matrix3x3 mtIdentity(); + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m); + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + +__inline +Matrix3x3 mtZero() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(0.f); + m.m_row[1] = (float4)(0.f); + m.m_row[2] = (float4)(0.f); + return m; +} + +__inline +Matrix3x3 mtIdentity() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(1,0,0,0); + m.m_row[1] = (float4)(0,1,0,0); + m.m_row[2] = (float4)(0,0,1,0); + return m; +} + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m) +{ + Matrix3x3 out; + out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); + out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); + out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); + return out; +} + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) +{ + Matrix3x3 transB; + transB = mtTranspose( b ); + Matrix3x3 ans; + // why this doesn't run when 0ing in the for{} + a.m_row[0].w = 0.f; + a.m_row[1].w = 0.f; + a.m_row[2].w = 0.f; + for(int i=0; i<3; i++) + { +// a.m_row[i].w = 0.f; + ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); + ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); + ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); + ans.m_row[i].w = 0.f; + } + return ans; +} + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + + + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + int m_batchIdx; + u32 m_paddings; +} Constraint4; + + + + + + +__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex) +{ + int i = GET_GLOBAL_IDX; + + if( i < numContactManifolds) + { + int pa = manifoldPtr[i].m_bodyAPtrAndSignBit; + bool isFixedA = (pa <0) || (pa == fixedBodyIndex); + int bodyIndexA = abs(pa); + if (!isFixedA) + { + AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x); + } + barrier(CLK_GLOBAL_MEM_FENCE); + int pb = manifoldPtr[i].m_bodyBPtrAndSignBit; + bool isFixedB = (pb <0) || (pb == fixedBodyIndex); + int bodyIndexB = abs(pb); + if (!isFixedB) + { + AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y); + } + } +} + +__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies) +{ + int i = GET_GLOBAL_IDX; + + if( i < numSplitBodies) + { + linearVelocities[i] = make_float4(0); + angularVelocities[i] = make_float4(0); + } +} + + +__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, +__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies) +{ + int i = GET_GLOBAL_IDX; + if (i<numBodies) + { + if (gBodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/((float)count); + float4 averageLinVel = make_float4(0.f); + float4 averageAngVel = make_float4(0.f); + + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + + }//bodies[i].m_invMass + }//i<numBodies +} + + + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = make_float4(n.xyz,0.f); + *angular0 = cross3(r0, n); + *angular1 = -cross3(r1, n); +} + + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); +} + + +void btPlaneSpace1 (float4 n, float4* p, float4* q); + void btPlaneSpace1 (float4 n, float4* p, float4* q) +{ + if (fabs(n.z) > 0.70710678f) { + // choose p in y-z plane + float a = n.y*n.y + n.z*n.z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n.z*k; + p[0].z = n.y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n.x*p[0].z; + q[0].z = n.x*p[0].y; + } + else { + // choose p in x-y plane + float a = n.x*n.x + n.y*n.y; + float k = 1.f/sqrt(a); + p[0].x = -n.y*k; + p[0].y = n.x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n.z*p[0].y; + q[0].y = n.z*p[0].x; + q[0].z = a*k; + } +} + + + + + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB, + float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB) +{ + float minRambdaDt = 0; + float maxRambdaDt = FLT_MAX; + + for(int ic=0; ic<4; ic++) + { + if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + + float4 angular0, angular1, linear; + float4 r0 = cs->m_worldPos[ic] - posA; + float4 r1 = cs->m_worldPos[ic] - posB; + setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); + + + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt ); + updated = min2( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + + if (invMassA) + { + *dLinVelA += linImp0; + *dAngVelA += angImp0; + } + if (invMassB) + { + *dLinVelB += linImp1; + *dAngVelB += angImp1; + } + } +} + + +// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + + +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, +__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, +__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities) +{ + + //float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + float4 dLinVelA = make_float4(0,0,0,0); + float4 dAngVelA = make_float4(0,0,0,0); + float4 dLinVelB = make_float4(0,0,0,0); + float4 dAngVelB = make_float4(0,0,0,0); + + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[0].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + + if (invMassA) + { + dLinVelA = deltaLinearVelocities[splitIndexA]; + dAngVelA = deltaAngularVelocities[splitIndexA]; + } + + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[0].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + + if (invMassB) + { + dLinVelB = deltaLinearVelocities[splitIndexB]; + dAngVelB = deltaAngularVelocities[splitIndexB]; + } + + solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, + posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB); + + if (invMassA) + { + deltaLinearVelocities[splitIndexA] = dLinVelA; + deltaAngularVelocities[splitIndexA] = dAngVelA; + } + if (invMassB) + { + deltaLinearVelocities[splitIndexB] = dLinVelB; + deltaAngularVelocities[splitIndexB] = dAngVelB; + } + +} + + +__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , +__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, +float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds +) +{ + int i = GET_GLOBAL_IDX; + if (i<numManifolds) + { + solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + } +} + + + + +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, + __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities) +{ + float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + float4 dLinVelA = make_float4(0,0,0,0); + float4 dAngVelA = make_float4(0,0,0,0); + float4 dLinVelB = make_float4(0,0,0,0); + float4 dAngVelB = make_float4(0,0,0,0); + + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[0].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + + if (invMassA) + { + dLinVelA = deltaLinearVelocities[splitIndexA]; + dAngVelA = deltaAngularVelocities[splitIndexA]; + } + + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[0].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + + if (invMassB) + { + dLinVelB = deltaLinearVelocities[splitIndexB]; + dAngVelB = deltaAngularVelocities[splitIndexB]; + } + + + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=ldsCs[0].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + +// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, +// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt ); + + + { + + __global Constraint4* cs = ldsCs; + + if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return; + const float4 center = cs->m_center; + + float4 n = -cs->m_linear; + + float4 tangent[2]; + btPlaneSpace1(n,&tangent[0],&tangent[1]); + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB ); + rambdaDt *= cs->m_fJacCoeffInv[i]; + + { + float prevSum = cs->m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs->m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + dLinVelA += linImp0; + dAngVelA += angImp0; + dLinVelB += linImp1; + dAngVelB += angImp1; + } + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + dAngVelA -= (angNA*0.1f)*n; + dAngVelB -= (angNB*0.1f)*n; + } + } + } + + + + } + + if (invMassA) + { + deltaLinearVelocities[splitIndexA] = dLinVelA; + deltaAngularVelocities[splitIndexA] = dAngVelA; + } + if (invMassB) + { + deltaLinearVelocities[splitIndexB] = dLinVelB; + deltaAngularVelocities[splitIndexB] = dAngVelB; + } + + +} + + +__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , + __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, + float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds +) +{ + int i = GET_GLOBAL_IDX; + if (i<numManifolds) + { + solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + } +} + + +__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies) +{ + int i = GET_GLOBAL_IDX; + if (i<numBodies) + { + if (gBodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + if (count) + { + gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset]; + gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset]; + } + } + } +} + + + +void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA, + const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, + __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB, + Constraint4* dstC ) +{ + dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); + dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit); + + float dtInv = 1.f/dt; + for(int ic=0; ic<4; ic++) + { + dstC->m_appliedRambdaDt[ic] = 0.f; + } + dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; + + + dstC->m_linear = src->m_worldNormalOnB; + dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() ); + for(int ic=0; ic<4; ic++) + { + float4 r0 = src->m_worldPosB[ic] - posA; + float4 r1 = src->m_worldPosB[ic] - posB; + + if( ic >= src->m_worldNormalOnB.w )//npoints + { + dstC->m_jacCoeffInv[ic] = 0.f; + continue; + } + + float relVelN; + { + float4 linear, angular0, angular1; + setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1); + + dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB); + + relVelN = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB); + + float e = 0.f;//src->getRestituitionCoeff(); + if( relVelN*relVelN < 0.004f ) e = 0.f; + + dstC->m_b[ic] = e*relVelN; + //float penetration = src->m_worldPosB[ic].w; + dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv; + dstC->m_appliedRambdaDt[ic] = 0.f; + } + } + + if( src->m_worldNormalOnB.w > 0 )//npoints + { // prepare friction + float4 center = make_float4(0.f); + for(int i=0; i<src->m_worldNormalOnB.w; i++) + center += src->m_worldPosB[i]; + center /= (float)src->m_worldNormalOnB.w; + + float4 tangent[2]; + btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]); + + float4 r[2]; + r[0] = center - posA; + r[1] = center - posB; + + for(int i=0; i<2; i++) + { + float4 linear, angular0, angular1; + setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1); + + dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB); + dstC->m_fAppliedRambdaDt[i] = 0.f; + } + dstC->m_center = center; + } + + for(int i=0; i<4; i++) + { + if( i<src->m_worldNormalOnB.w ) + { + dstC->m_worldPos[i] = src->m_worldPosB[i]; + } + else + { + dstC->m_worldPos[i] = make_float4(0.f); + } + } +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, +__global const unsigned int* bodyCount, +int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff +) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + Constraint4 cs; + + float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1; + float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h new file mode 100644 index 0000000000..c0173ad9f4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h @@ -0,0 +1,909 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverUtilsCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a1, float4 b1)\n" +"{\n" +" float4 a=make_float4(a1.xyz,0.f);\n" +" float4 b=make_float4(b1.xyz,0.f);\n" +" //float4 a=a1;\n" +" //float4 b=b1;\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings;\n" +"} Constraint4;\n" +"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" \n" +" if( i < numContactManifolds)\n" +" {\n" +" int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n" +" bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n" +" int bodyIndexA = abs(pa);\n" +" if (!isFixedA)\n" +" {\n" +" AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n" +" }\n" +" barrier(CLK_GLOBAL_MEM_FENCE);\n" +" int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n" +" bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n" +" int bodyIndexB = abs(pb);\n" +" if (!isFixedB)\n" +" {\n" +" AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n" +" } \n" +" }\n" +"}\n" +"__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" \n" +" if( i < numSplitBodies)\n" +" {\n" +" linearVelocities[i] = make_float4(0);\n" +" angularVelocities[i] = make_float4(0);\n" +" }\n" +"}\n" +"__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" +"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numBodies)\n" +" {\n" +" if (gBodies[i].m_invMass)\n" +" {\n" +" int bodyOffset = offsetSplitBodies[i];\n" +" int count = bodyCount[i];\n" +" float factor = 1.f/((float)count);\n" +" float4 averageLinVel = make_float4(0.f);\n" +" float4 averageAngVel = make_float4(0.f);\n" +" \n" +" for (int j=0;j<count;j++)\n" +" {\n" +" averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n" +" averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n" +" }\n" +" \n" +" for (int j=0;j<count;j++)\n" +" {\n" +" deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n" +" deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n" +" }\n" +" \n" +" }//bodies[i].m_invMass\n" +" }//i<numBodies\n" +"}\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = make_float4(n.xyz,0.f);\n" +" *angular0 = cross3(r0, n);\n" +" *angular1 = -cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n" +"}\n" +"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" +" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n.z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n.y*n.y + n.z*n.z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n.z*k;\n" +" p[0].z = n.y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n.x*p[0].z;\n" +" q[0].z = n.x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n.x*n.x + n.y*n.y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n.y*k;\n" +" p[0].y = n.x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n.z*p[0].y;\n" +" q[0].y = n.z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n" +" float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n" +"{\n" +" float minRambdaDt = 0;\n" +" float maxRambdaDt = FLT_MAX;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = cs->m_worldPos[ic] - posA;\n" +" float4 r1 = cs->m_worldPos[ic] - posB;\n" +" setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" +" \n" +" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" +" *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n" +" rambdaDt *= cs->m_jacCoeffInv[ic];\n" +" \n" +" {\n" +" float prevSum = cs->m_appliedRambdaDt[ic];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt );\n" +" updated = min2( updated, maxRambdaDt );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_appliedRambdaDt[ic] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" if (invMassA)\n" +" {\n" +" *dLinVelA += linImp0;\n" +" *dAngVelA += angImp0;\n" +" }\n" +" if (invMassB)\n" +" {\n" +" *dLinVelB += linImp1;\n" +" *dAngVelB += angImp1;\n" +" }\n" +" }\n" +"}\n" +"// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n" +"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" +"{\n" +" //float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" float4 dLinVelA = make_float4(0,0,0,0);\n" +" float4 dAngVelA = make_float4(0,0,0,0);\n" +" float4 dLinVelB = make_float4(0,0,0,0);\n" +" float4 dAngVelB = make_float4(0,0,0,0);\n" +" \n" +" int bodyOffsetA = offsetSplitBodies[aIdx];\n" +" int constraintOffsetA = contactConstraintOffsets[0].x;\n" +" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" +" \n" +" if (invMassA)\n" +" {\n" +" dLinVelA = deltaLinearVelocities[splitIndexA];\n" +" dAngVelA = deltaAngularVelocities[splitIndexA];\n" +" }\n" +" int bodyOffsetB = offsetSplitBodies[bIdx];\n" +" int constraintOffsetB = contactConstraintOffsets[0].y;\n" +" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" +" if (invMassB)\n" +" {\n" +" dLinVelB = deltaLinearVelocities[splitIndexB];\n" +" dAngVelB = deltaAngularVelocities[splitIndexB];\n" +" }\n" +" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +" posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n" +" if (invMassA)\n" +" {\n" +" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" +" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" +" } \n" +" if (invMassB)\n" +" {\n" +" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" +" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" +" }\n" +"}\n" +"__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" +"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" +"float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" +")\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numManifolds)\n" +" {\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +" }\n" +"}\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n" +" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" +"{\n" +" float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" float4 dLinVelA = make_float4(0,0,0,0);\n" +" float4 dAngVelA = make_float4(0,0,0,0);\n" +" float4 dLinVelB = make_float4(0,0,0,0);\n" +" float4 dAngVelB = make_float4(0,0,0,0);\n" +" \n" +" int bodyOffsetA = offsetSplitBodies[aIdx];\n" +" int constraintOffsetA = contactConstraintOffsets[0].x;\n" +" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" +" \n" +" if (invMassA)\n" +" {\n" +" dLinVelA = deltaLinearVelocities[splitIndexA];\n" +" dAngVelA = deltaAngularVelocities[splitIndexA];\n" +" }\n" +" int bodyOffsetB = offsetSplitBodies[bIdx];\n" +" int constraintOffsetB = contactConstraintOffsets[0].y;\n" +" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" +" if (invMassB)\n" +" {\n" +" dLinVelB = deltaLinearVelocities[splitIndexB];\n" +" dAngVelB = deltaAngularVelocities[splitIndexB];\n" +" }\n" +" {\n" +" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" +" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" +" float sum = 0;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" +" }\n" +" frictionCoeff = 0.7f;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" maxRambdaDt[j] = frictionCoeff*sum;\n" +" minRambdaDt[j] = -maxRambdaDt[j];\n" +" }\n" +" \n" +"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" +" \n" +" \n" +" {\n" +" \n" +" __global Constraint4* cs = ldsCs;\n" +" \n" +" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" +" const float4 center = cs->m_center;\n" +" \n" +" float4 n = -cs->m_linear;\n" +" \n" +" float4 tangent[2];\n" +" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = center - posA;\n" +" float4 r1 = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n" +" rambdaDt *= cs->m_fJacCoeffInv[i];\n" +" \n" +" {\n" +" float prevSum = cs->m_fAppliedRambdaDt[i];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt[i] );\n" +" updated = min2( updated, maxRambdaDt[i] );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_fAppliedRambdaDt[i] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" dLinVelA += linImp0;\n" +" dAngVelA += angImp0;\n" +" dLinVelB += linImp1;\n" +" dAngVelB += angImp1;\n" +" }\n" +" { // angular damping for point constraint\n" +" float4 ab = normalize3( posB - posA );\n" +" float4 ac = normalize3( center - posA );\n" +" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" +" {\n" +" float angNA = dot3F4( n, angVelA );\n" +" float angNB = dot3F4( n, angVelB );\n" +" \n" +" dAngVelA -= (angNA*0.1f)*n;\n" +" dAngVelB -= (angNB*0.1f)*n;\n" +" }\n" +" }\n" +" }\n" +" \n" +" \n" +" }\n" +" if (invMassA)\n" +" {\n" +" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" +" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" +" } \n" +" if (invMassB)\n" +" {\n" +" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" +" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" +" }\n" +" \n" +"}\n" +"__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" +" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" +" float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" +")\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numManifolds)\n" +" {\n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +" }\n" +"}\n" +"__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numBodies)\n" +" {\n" +" if (gBodies[i].m_invMass)\n" +" {\n" +" int bodyOffset = offsetSplitBodies[i];\n" +" int count = bodyCount[i];\n" +" if (count)\n" +" {\n" +" gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n" +" gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n" +" }\n" +" }\n" +" }\n" +"}\n" +"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" +" const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n" +" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n" +" Constraint4* dstC )\n" +"{\n" +" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" +" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" +" float dtInv = 1.f/dt;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" +" dstC->m_linear = src->m_worldNormalOnB;\n" +" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" float4 r0 = src->m_worldPosB[ic] - posA;\n" +" float4 r1 = src->m_worldPosB[ic] - posB;\n" +" if( ic >= src->m_worldNormalOnB.w )//npoints\n" +" {\n" +" dstC->m_jacCoeffInv[ic] = 0.f;\n" +" continue;\n" +" }\n" +" float relVelN;\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" +" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n" +" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB);\n" +" float e = 0.f;//src->getRestituitionCoeff();\n" +" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" +" dstC->m_b[ic] = e*relVelN;\n" +" //float penetration = src->m_worldPosB[ic].w;\n" +" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" }\n" +" if( src->m_worldNormalOnB.w > 0 )//npoints\n" +" { // prepare friction\n" +" float4 center = make_float4(0.f);\n" +" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" +" center += src->m_worldPosB[i];\n" +" center /= (float)src->m_worldNormalOnB.w;\n" +" float4 tangent[2];\n" +" btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" +" \n" +" float4 r[2];\n" +" r[0] = center - posA;\n" +" r[1] = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" +" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n" +" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" +" }\n" +" dstC->m_center = center;\n" +" }\n" +" for(int i=0; i<4; i++)\n" +" {\n" +" if( i<src->m_worldNormalOnB.w )\n" +" {\n" +" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" +" }\n" +" else\n" +" {\n" +" dstC->m_worldPos[i] = make_float4(0.f);\n" +" }\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n" +"__global const unsigned int* bodyCount,\n" +"int nContacts,\n" +"float dt,\n" +"float positionDrift,\n" +"float positionConstraintCoeff\n" +")\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" Constraint4 cs;\n" +" float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1;\n" +" float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1;\n" +" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" +" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n" +" &cs );\n" +" \n" +" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" +" gConstraintOut[gIdx] = cs;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl new file mode 100644 index 0000000000..ba8ba735d0 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl @@ -0,0 +1,22 @@ + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h" + + +__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB) +{ + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB); + } +} + +__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs) +{ + int pairId = get_global_id(0); + if( pairId< numPairs ) + { + pairs[pairId].z = 0xffffffff; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h new file mode 100644 index 0000000000..d70e74017a --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h @@ -0,0 +1,483 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* updateAabbsKernelCL= \ +"#ifndef B3_UPDATE_AABBS_H\n" +"#define B3_UPDATE_AABBS_H\n" +"#ifndef B3_AABB_H\n" +"#define B3_AABB_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3Aabb b3Aabb_t;\n" +"struct b3Aabb\n" +"{\n" +" union\n" +" {\n" +" float m_min[4];\n" +" b3Float4 m_minVec;\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float m_max[4];\n" +" b3Float4 m_maxVec;\n" +" int m_signedMaxIndices[4];\n" +" };\n" +"};\n" +"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" +" b3Float4ConstArg pos,\n" +" b3QuatConstArg orn,\n" +" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" +"{\n" +" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" +" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" +" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" +" b3Mat3x3 m;\n" +" m = b3QuatGetRotationMatrix(orn);\n" +" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" +" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" +" \n" +" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" +" 0.f);\n" +" *aabbMinOut = center-extent;\n" +" *aabbMaxOut = center+extent;\n" +"}\n" +"/// conservative test for overlap between two aabbs\n" +"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" +" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" +" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" +" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"#endif //B3_AABB_H\n" +"#ifndef B3_COLLIDABLE_H\n" +"#define B3_COLLIDABLE_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"enum b3ShapeTypes\n" +"{\n" +" SHAPE_HEIGHT_FIELD=1,\n" +" SHAPE_CONVEX_HULL=3,\n" +" SHAPE_PLANE=4,\n" +" SHAPE_CONCAVE_TRIMESH=5,\n" +" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" +" SHAPE_SPHERE=7,\n" +" MAX_NUM_SHAPE_TYPES,\n" +"};\n" +"typedef struct b3Collidable b3Collidable_t;\n" +"struct b3Collidable\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" int m_shapeType;\n" +" union\n" +" {\n" +" int m_shapeIndex;\n" +" float m_height;\n" +" };\n" +"};\n" +"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" +"struct b3GpuChildShape\n" +"{\n" +" b3Float4 m_childPosition;\n" +" b3Quat m_childOrientation;\n" +" union\n" +" {\n" +" int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n" +" int m_capsuleAxis;\n" +" };\n" +" union \n" +" {\n" +" float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n" +" int m_numChildShapes;//used for compound shape\n" +" };\n" +" union \n" +" {\n" +" float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n" +" int m_collidableShapeIndex;\n" +" };\n" +" int m_shapeType;\n" +"};\n" +"struct b3CompoundOverlappingPair\n" +"{\n" +" int m_bodyIndexA;\n" +" int m_bodyIndexB;\n" +"// int m_pairType;\n" +" int m_childShapeIndexA;\n" +" int m_childShapeIndexB;\n" +"};\n" +"#endif //B3_COLLIDABLE_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n" +"{\n" +" __global const b3RigidBodyData_t* body = &bodies[bodyId];\n" +" b3Float4 position = body->m_pos;\n" +" b3Quat orientation = body->m_quat;\n" +" \n" +" int collidableIndex = body->m_collidableIdx;\n" +" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n" +" \n" +" if (shapeIndex>=0)\n" +" {\n" +" \n" +" b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n" +" b3Aabb_t worldAabb;\n" +" \n" +" b3Float4 aabbAMinOut,aabbAMaxOut; \n" +" float margin = 0.f;\n" +" b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n" +" \n" +" worldAabb.m_minVec =aabbAMinOut;\n" +" worldAabb.m_minIndices[3] = bodyId;\n" +" worldAabb.m_maxVec = aabbAMaxOut;\n" +" worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n" +" worldAabbs[bodyId] = worldAabb;\n" +" }\n" +"}\n" +"#endif //B3_UPDATE_AABBS_H\n" +"__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n" +"{\n" +" int nodeID = get_global_id(0);\n" +" if( nodeID < numNodes )\n" +" {\n" +" b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n" +" }\n" +"}\n" +"__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n" +"{\n" +" int pairId = get_global_id(0);\n" +" if( pairId< numPairs )\n" +" {\n" +" pairs[pairId].z = 0xffffffff;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/premake4.lua b/thirdparty/bullet/src/Bullet3OpenCL/premake4.lua new file mode 100644 index 0000000000..55a8613634 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/premake4.lua @@ -0,0 +1,29 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("Bullet3OpenCL_" .. vendor) + + initOpenCL(vendor) + + kind "StaticLib" + + + includedirs { + ".",".." + } + + files { + "**.cpp", + "**.h" + } + + end +end + +createProject("clew") +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") |